@portabletext/block-tools 4.0.2 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. package/lib/_chunks-es/helpers.js +1 -64
  2. package/lib/_chunks-es/helpers.js.map +1 -1
  3. package/lib/index.js +487 -38
  4. package/lib/index.js.map +1 -1
  5. package/package.json +9 -9
  6. package/src/HtmlDeserializer/helpers.ts +1 -183
  7. package/src/HtmlDeserializer/index.ts +14 -25
  8. package/src/HtmlDeserializer/preprocessors/index.ts +8 -6
  9. package/src/HtmlDeserializer/preprocessors/{gdocs.ts → preprocessor.gdocs.ts} +2 -22
  10. package/src/HtmlDeserializer/preprocessors/{html.ts → preprocessor.html.ts} +1 -1
  11. package/src/HtmlDeserializer/preprocessors/{notion.ts → preprocessor.notion.ts} +1 -1
  12. package/src/HtmlDeserializer/preprocessors/{whitespace.ts → preprocessor.whitespace.ts} +28 -3
  13. package/src/HtmlDeserializer/preprocessors/{word.ts → preprocessor.word.ts} +1 -1
  14. package/src/HtmlDeserializer/rules/index.ts +6 -4
  15. package/src/HtmlDeserializer/rules/{gdocs.ts → rules.gdocs.ts} +1 -1
  16. package/src/HtmlDeserializer/rules/{html.ts → rules.html.ts} +3 -3
  17. package/src/HtmlDeserializer/rules/{notion.ts → rules.notion.ts} +1 -1
  18. package/src/HtmlDeserializer/rules/rules.word.ts +95 -0
  19. package/src/HtmlDeserializer/trim-whitespace.ts +157 -0
  20. package/src/HtmlDeserializer/word-online/asserters.word-online.ts +153 -0
  21. package/src/HtmlDeserializer/word-online/preprocessor.word-online.ts +263 -0
  22. package/src/HtmlDeserializer/word-online/rules.word-online.ts +390 -0
  23. package/src/HtmlDeserializer/rules/word.ts +0 -59
  24. package/src/HtmlDeserializer/rules/{whitespace-text-node.ts → rules.whitespace-text-node.ts} +0 -0
@@ -1,10 +1,5 @@
1
1
  import type {Schema} from '@portabletext/schema'
2
- import {
3
- isTextBlock,
4
- type PortableTextObject,
5
- type PortableTextTextBlock,
6
- } from '@portabletext/schema'
7
- import {isEqual} from 'lodash'
2
+ import {isTextBlock, type PortableTextObject} from '@portabletext/schema'
8
3
  import {DEFAULT_BLOCK} from '../constants'
9
4
  import type {
10
5
  ArbitraryTypedObject,
@@ -49,88 +44,6 @@ export function defaultParseHtml(): HtmlParser {
49
44
  }
50
45
  }
51
46
 
52
- function nextSpan(block: PortableTextTextBlock, index: number) {
53
- const next = block.children[index + 1]
54
- return next && next._type === 'span' ? next : null
55
- }
56
-
57
- function prevSpan(block: PortableTextTextBlock, index: number) {
58
- const prev = block.children[index - 1]
59
- return prev && prev._type === 'span' ? prev : null
60
- }
61
-
62
- function isWhiteSpaceChar(text: string) {
63
- return ['\xa0', ' '].includes(text)
64
- }
65
-
66
- /**
67
- * NOTE: _mutates_ passed blocks!
68
- *
69
- * @param blocks - Array of blocks to trim whitespace for
70
- * @returns
71
- */
72
- export function trimWhitespace(
73
- schema: Schema,
74
- blocks: TypedObject[],
75
- ): TypedObject[] {
76
- blocks.forEach((block) => {
77
- if (!isTextBlock({schema}, block)) {
78
- return
79
- }
80
-
81
- // eslint-disable-next-line complexity
82
- block.children.forEach((child, index) => {
83
- if (!isMinimalSpan(child)) {
84
- return
85
- }
86
- const nextChild = nextSpan(block, index)
87
- const prevChild = prevSpan(block, index)
88
- if (index === 0) {
89
- child.text = child.text.replace(/^[^\S\n]+/g, '')
90
- }
91
- if (index === block.children.length - 1) {
92
- child.text = child.text.replace(/[^\S\n]+$/g, '')
93
- }
94
- if (
95
- /\s/.test(child.text.slice(Math.max(0, child.text.length - 1))) &&
96
- nextChild &&
97
- isMinimalSpan(nextChild) &&
98
- /\s/.test(nextChild.text.slice(0, 1))
99
- ) {
100
- child.text = child.text.replace(/[^\S\n]+$/g, '')
101
- }
102
- if (
103
- /\s/.test(child.text.slice(0, 1)) &&
104
- prevChild &&
105
- isMinimalSpan(prevChild) &&
106
- /\s/.test(prevChild.text.slice(Math.max(0, prevChild.text.length - 1)))
107
- ) {
108
- child.text = child.text.replace(/^[^\S\n]+/g, '')
109
- }
110
- if (!child.text) {
111
- block.children.splice(index, 1)
112
- }
113
- if (
114
- prevChild &&
115
- isEqual(prevChild.marks, child.marks) &&
116
- isWhiteSpaceChar(child.text)
117
- ) {
118
- prevChild.text += ' '
119
- block.children.splice(index, 1)
120
- } else if (
121
- nextChild &&
122
- isEqual(nextChild.marks, child.marks) &&
123
- isWhiteSpaceChar(child.text)
124
- ) {
125
- nextChild.text = ` ${nextChild.text}`
126
- block.children.splice(index, 1)
127
- }
128
- })
129
- })
130
-
131
- return blocks
132
- }
133
-
134
47
  export function ensureRootIsBlocks(
135
48
  schema: Schema,
136
49
  objects: Array<ArbitraryTypedObject>,
@@ -193,98 +106,3 @@ export function isPlaceholderAnnotation(
193
106
  export function isElement(node: Node): node is Element {
194
107
  return node.nodeType === 1
195
108
  }
196
-
197
- /**
198
- * Helper to normalize whitespace to only 1 empty block between content nodes
199
- * @param node - Root node to process
200
- */
201
- export function normalizeWhitespace(rootNode: Node) {
202
- let emptyBlockCount = 0
203
- let lastParent = null
204
- const nodesToRemove: Node[] = []
205
-
206
- for (let child = rootNode.firstChild; child; child = child.nextSibling) {
207
- if (!isElement(child)) {
208
- normalizeWhitespace(child)
209
- emptyBlockCount = 0
210
- continue
211
- }
212
-
213
- const elm = child as HTMLElement
214
-
215
- if (isWhitespaceBlock(elm)) {
216
- if (lastParent && elm.parentElement === lastParent) {
217
- emptyBlockCount++
218
- if (emptyBlockCount > 1) {
219
- nodesToRemove.push(elm)
220
- }
221
- } else {
222
- // Different parent, reset counter
223
- emptyBlockCount = 1
224
- }
225
-
226
- lastParent = elm.parentElement
227
- } else {
228
- // Recurse into child nodes
229
- normalizeWhitespace(child)
230
- // Reset counter for siblings
231
- emptyBlockCount = 0
232
- }
233
- }
234
-
235
- // Remove marked nodes
236
- nodesToRemove.forEach((node) => {
237
- node.parentElement?.removeChild(node)
238
- })
239
- }
240
-
241
- /**
242
- * Helper to remove all whitespace nodes
243
- * @param node - Root node to process
244
- */
245
- export function removeAllWhitespace(rootNode: Node) {
246
- const nodesToRemove: Node[] = []
247
-
248
- function collectNodesToRemove(currentNode: Node) {
249
- if (isElement(currentNode)) {
250
- const elm = currentNode as HTMLElement
251
-
252
- // Handle <br> tags that is between <p> tags
253
- if (
254
- tagName(elm) === 'br' &&
255
- (tagName(elm.nextElementSibling) === 'p' ||
256
- tagName(elm.previousElementSibling) === 'p')
257
- ) {
258
- nodesToRemove.push(elm)
259
-
260
- return
261
- }
262
-
263
- // Handle empty blocks
264
- if (
265
- (tagName(elm) === 'p' || tagName(elm) === 'br') &&
266
- elm?.firstChild?.textContent?.trim() === ''
267
- ) {
268
- nodesToRemove.push(elm)
269
-
270
- return
271
- }
272
-
273
- // Recursively process child nodes
274
- for (let child = elm.firstChild; child; child = child.nextSibling) {
275
- collectNodesToRemove(child)
276
- }
277
- }
278
- }
279
-
280
- collectNodesToRemove(rootNode)
281
-
282
- // Remove the collected nodes
283
- nodesToRemove.forEach((node) => {
284
- node.parentElement?.removeChild(node)
285
- })
286
- }
287
-
288
- function isWhitespaceBlock(elm: HTMLElement): boolean {
289
- return ['p', 'br'].includes(tagName(elm) || '') && !elm.textContent?.trim()
290
- }
@@ -10,8 +10,6 @@ import type {
10
10
  ArbitraryTypedObject,
11
11
  DeserializerRule,
12
12
  HtmlDeserializerOptions,
13
- HtmlParser,
14
- HtmlPreprocessorOptions,
15
13
  PlaceholderAnnotation,
16
14
  PlaceholderDecorator,
17
15
  TypedObject,
@@ -28,10 +26,10 @@ import {
28
26
  isPlaceholderAnnotation,
29
27
  isPlaceholderDecorator,
30
28
  tagName,
31
- trimWhitespace,
32
29
  } from './helpers'
33
- import preprocessors from './preprocessors'
30
+ import {preprocessors} from './preprocessors'
34
31
  import {createRules} from './rules'
32
+ import {trimWhitespace} from './trim-whitespace'
35
33
 
36
34
  /**
37
35
  * HTML Deserializer
@@ -42,6 +40,7 @@ export default class HtmlDeserializer {
42
40
  schema: Schema
43
41
  rules: DeserializerRule[]
44
42
  parseHtml: (html: string) => HTMLElement
43
+ whitespaceMode: 'preserve' | 'remove' | 'normalize'
45
44
  _markDefs: PortableTextObject[] = []
46
45
 
47
46
  /**
@@ -59,9 +58,16 @@ export default class HtmlDeserializer {
59
58
  this.schema = schema
60
59
  this.keyGenerator = options.keyGenerator ?? keyGenerator
61
60
  this.rules = [...rules, ...standardRules]
61
+ this.whitespaceMode = unstable_whitespaceOnPasteMode
62
62
  const parseHtml = options.parseHtml || defaultParseHtml()
63
63
  this.parseHtml = (html) => {
64
- const doc = preprocess(html, parseHtml, {unstable_whitespaceOnPasteMode})
64
+ const cleanHTML = vercelStegaClean(html)
65
+ const doc = parseHtml(cleanHTML)
66
+
67
+ for (const processor of preprocessors) {
68
+ processor(cleanHTML, doc)
69
+ }
70
+
65
71
  return doc.body
66
72
  }
67
73
  }
@@ -77,9 +83,10 @@ export default class HtmlDeserializer {
77
83
  const {parseHtml} = this
78
84
  const fragment = parseHtml(html)
79
85
  const children = Array.from(fragment.childNodes) as HTMLElement[]
80
- // Ensure that there are no blocks within blocks, and trim whitespace
86
+
81
87
  const blocks = trimWhitespace(
82
- this.schema,
88
+ {schema: this.schema},
89
+ this.whitespaceMode,
83
90
  flattenNestedBlocks(
84
91
  {schema: this.schema},
85
92
  ensureRootIsBlocks(
@@ -306,21 +313,3 @@ export default class HtmlDeserializer {
306
313
  }, [] as TypedObject[])
307
314
  }
308
315
  }
309
-
310
- // TODO: make this plugin-style
311
- function preprocess(
312
- html: string,
313
- parseHtml: HtmlParser,
314
- options: HtmlPreprocessorOptions,
315
- ): Document {
316
- const cleanHTML = vercelStegaClean(html)
317
- const doc = parseHtml(normalizeHtmlBeforePreprocess(cleanHTML))
318
- preprocessors.forEach((processor) => {
319
- processor(cleanHTML, doc, options)
320
- })
321
- return doc
322
- }
323
-
324
- function normalizeHtmlBeforePreprocess(html: string): string {
325
- return html.trim()
326
- }
@@ -1,13 +1,15 @@
1
- import preprocessGDocs from './gdocs'
2
- import preprocessHTML from './html'
3
- import preprocessNotion from './notion'
4
- import preprocessWhitespace from './whitespace'
5
- import preprocessWord from './word'
1
+ import {preprocessWordOnline} from '../word-online/preprocessor.word-online'
2
+ import {preprocessGDocs} from './preprocessor.gdocs'
3
+ import {preprocessHTML} from './preprocessor.html'
4
+ import {preprocessNotion} from './preprocessor.notion'
5
+ import {preprocessWhitespace} from './preprocessor.whitespace'
6
+ import {preprocessWord} from './preprocessor.word'
6
7
 
7
- export default [
8
+ export const preprocessors = [
8
9
  preprocessWhitespace,
9
10
  preprocessNotion,
10
11
  preprocessWord,
12
+ preprocessWordOnline,
11
13
  preprocessGDocs,
12
14
  preprocessHTML,
13
15
  ]
@@ -1,14 +1,7 @@
1
- import type {HtmlPreprocessorOptions} from '../../types'
2
- import {normalizeWhitespace, removeAllWhitespace, tagName} from '../helpers'
1
+ import {tagName} from '../helpers'
3
2
  import {_XPathResult} from './xpathResult'
4
3
 
5
- export default (
6
- _html: string,
7
- doc: Document,
8
- options: HtmlPreprocessorOptions,
9
- ): Document => {
10
- const whitespaceOnPasteMode =
11
- options?.unstable_whitespaceOnPasteMode || 'preserve'
4
+ export function preprocessGDocs(_html: string, doc: Document): Document {
12
5
  let gDocsRootOrSiblingNode = doc
13
6
  .evaluate(
14
7
  '//*[@id and contains(@id, "docs-internal-guid")]',
@@ -27,19 +20,6 @@ export default (
27
20
  gDocsRootOrSiblingNode = doc.body
28
21
  }
29
22
 
30
- switch (whitespaceOnPasteMode) {
31
- case 'normalize':
32
- // Keep only 1 empty block between content nodes
33
- normalizeWhitespace(gDocsRootOrSiblingNode)
34
- break
35
- case 'remove':
36
- // Remove all whitespace nodes
37
- removeAllWhitespace(gDocsRootOrSiblingNode)
38
- break
39
- default:
40
- break
41
- }
42
-
43
23
  // Tag every child with attribute 'is-google-docs' so that the GDocs rule-set can
44
24
  // work exclusivly on these children
45
25
  const childNodes = doc.evaluate(
@@ -15,7 +15,7 @@ const unwantedWordDocumentPaths = [
15
15
  '//link',
16
16
  ]
17
17
 
18
- export default (_html: string, doc: Document): Document => {
18
+ export function preprocessHTML(_html: string, doc: Document): Document {
19
19
  // Make sure text directly on the body is wrapped in spans.
20
20
  // This mimics what the browser does before putting html on the clipboard,
21
21
  // when used in a script context with JSDOM
@@ -1,6 +1,6 @@
1
1
  import {_XPathResult} from './xpathResult'
2
2
 
3
- export default (html: string, doc: Document): Document => {
3
+ export function preprocessNotion(html: string, doc: Document): Document {
4
4
  const NOTION_REGEX = /<!-- notionvc:.*?-->/g
5
5
 
6
6
  if (html.match(NOTION_REGEX)) {
@@ -1,7 +1,19 @@
1
1
  import {PRESERVE_WHITESPACE_TAGS} from '../../constants'
2
2
  import {_XPathResult} from './xpathResult'
3
3
 
4
- export default (_: string, doc: Document): Document => {
4
+ // Elements that only contain block-level children (not inline text content)
5
+ const BLOCK_CONTAINER_ELEMENTS = [
6
+ 'body',
7
+ 'table',
8
+ 'tbody',
9
+ 'thead',
10
+ 'tfoot',
11
+ 'tr',
12
+ 'ul',
13
+ 'ol',
14
+ ]
15
+
16
+ export function preprocessWhitespace(_: string, doc: Document): Document {
5
17
  // Recursively process all nodes.
6
18
  function processNode(node: Node) {
7
19
  // If this is a text node and not inside a tag where whitespace should be preserved, process it.
@@ -11,14 +23,27 @@ export default (_: string, doc: Document): Document => {
11
23
  node.parentElement?.tagName.toLowerCase() || '',
12
24
  )
13
25
  ) {
14
- node.textContent =
26
+ const normalized =
15
27
  node.textContent
16
28
  ?.replace(/\s\s+/g, ' ') // Remove multiple whitespace
17
29
  .replace(/[\r\n]+/g, ' ') || '' // Replace newlines with spaces
30
+ const parentTag = node.parentElement?.tagName.toLowerCase()
31
+
32
+ if (
33
+ parentTag &&
34
+ BLOCK_CONTAINER_ELEMENTS.includes(parentTag) &&
35
+ normalized.trim() === ''
36
+ ) {
37
+ // If parent is a block container and text is only whitespace, remove it
38
+ node.parentNode?.removeChild(node)
39
+ } else {
40
+ node.textContent = normalized
41
+ }
18
42
  }
19
43
  // Otherwise, if this node has children, process them.
20
44
  else {
21
- for (let i = 0; i < node.childNodes.length; i++) {
45
+ // Process children in reverse to handle removals safely
46
+ for (let i = node.childNodes.length - 1; i >= 0; i--) {
22
47
  processNode(node.childNodes[i])
23
48
  }
24
49
  }
@@ -35,7 +35,7 @@ function isWordHtml(html: string) {
35
35
  return WORD_HTML_REGEX.test(html)
36
36
  }
37
37
 
38
- export default (html: string, doc: Document): Document => {
38
+ export function preprocessWord(html: string, doc: Document): Document {
39
39
  if (!isWordHtml(html)) {
40
40
  return doc
41
41
  }
@@ -1,10 +1,11 @@
1
1
  import type {Schema} from '@portabletext/schema'
2
2
  import type {SchemaMatchers} from '../../schema-matchers'
3
3
  import type {DeserializerRule} from '../../types'
4
- import createGDocsRules from './gdocs'
5
- import createHTMLRules from './html'
6
- import createNotionRules from './notion'
7
- import createWordRules from './word'
4
+ import {createWordOnlineRules} from '../word-online/rules.word-online'
5
+ import {createGDocsRules} from './rules.gdocs'
6
+ import {createHTMLRules} from './rules.html'
7
+ import {createNotionRules} from './rules.notion'
8
+ import {createWordRules} from './rules.word'
8
9
 
9
10
  export function createRules(
10
11
  schema: Schema,
@@ -12,6 +13,7 @@ export function createRules(
12
13
  ): DeserializerRule[] {
13
14
  return [
14
15
  ...createWordRules(),
16
+ ...createWordOnlineRules(schema, options),
15
17
  ...createNotionRules(),
16
18
  ...createGDocsRules(schema),
17
19
  ...createHTMLRules(schema, options),
@@ -93,7 +93,7 @@ function getBlockStyle(schema: Schema, el: Node): string {
93
93
  return block.style
94
94
  }
95
95
 
96
- export default function createGDocsRules(schema: Schema): DeserializerRule[] {
96
+ export function createGDocsRules(schema: Schema): DeserializerRule[] {
97
97
  return [
98
98
  {
99
99
  deserialize(el, next) {
@@ -14,9 +14,9 @@ import type {SchemaMatchers} from '../../schema-matchers'
14
14
  import type {DeserializerRule} from '../../types'
15
15
  import {keyGenerator} from '../../util/randomKey'
16
16
  import {isElement, tagName} from '../helpers'
17
- import {whitespaceTextNodeRule} from './whitespace-text-node'
17
+ import {whitespaceTextNodeRule} from './rules.whitespace-text-node'
18
18
 
19
- export function resolveListItem(
19
+ function resolveListItem(
20
20
  schema: Schema,
21
21
  listNodeTagName: string,
22
22
  ): string | undefined {
@@ -35,7 +35,7 @@ export function resolveListItem(
35
35
  return undefined
36
36
  }
37
37
 
38
- export default function createHTMLRules(
38
+ export function createHTMLRules(
39
39
  schema: Schema,
40
40
  options: {keyGenerator?: () => string; matchers?: SchemaMatchers},
41
41
  ): DeserializerRule[] {
@@ -27,7 +27,7 @@ function isNotion(el: Node): boolean {
27
27
  return isElement(el) && Boolean(el.getAttribute('data-is-notion'))
28
28
  }
29
29
 
30
- export default function createNotionRules(): DeserializerRule[] {
30
+ export function createNotionRules(): DeserializerRule[] {
31
31
  return [
32
32
  {
33
33
  deserialize(el) {
@@ -0,0 +1,95 @@
1
+ import {
2
+ BLOCK_DEFAULT_STYLE,
3
+ DEFAULT_BLOCK,
4
+ HTML_HEADER_TAGS,
5
+ } from '../../constants'
6
+ import type {DeserializerRule} from '../../types'
7
+ import {isElement, tagName} from '../helpers'
8
+
9
+ function getListItemStyle(el: Node): string | undefined {
10
+ const style = isElement(el) && el.getAttribute('style')
11
+ if (!style) {
12
+ return undefined
13
+ }
14
+
15
+ if (!style.match(/lfo\d+/)) {
16
+ return undefined
17
+ }
18
+
19
+ return style.match('lfo1') ? 'number' : 'bullet'
20
+ }
21
+
22
+ function getListItemLevel(el: Node): number | undefined {
23
+ const style = isElement(el) && el.getAttribute('style')
24
+ if (!style) {
25
+ return undefined
26
+ }
27
+
28
+ const levelMatch = style.match(/level\d+/)
29
+ if (!levelMatch) {
30
+ return undefined
31
+ }
32
+
33
+ const [level] = levelMatch[0].match(/\d/) || []
34
+ const levelNum = level ? Number.parseInt(level, 10) : 1
35
+ return levelNum || 1
36
+ }
37
+
38
+ function isWordListElement(el: Node): boolean {
39
+ if (!isElement(el)) {
40
+ return false
41
+ }
42
+
43
+ // Check for specific class names
44
+ if (el.className) {
45
+ if (
46
+ el.className === 'MsoListParagraphCxSpFirst' ||
47
+ el.className === 'MsoListParagraphCxSpMiddle' ||
48
+ el.className === 'MsoListParagraphCxSpLast'
49
+ ) {
50
+ return true
51
+ }
52
+ }
53
+
54
+ // Check for mso-list in style attribute
55
+ const style = el.getAttribute('style')
56
+ if (style && /mso-list:\s*l\d+\s+level\d+\s+lfo\d+/.test(style)) {
57
+ return true
58
+ }
59
+
60
+ return false
61
+ }
62
+
63
+ function getHeadingStyle(el: Node): string | undefined {
64
+ const tag = tagName(el)
65
+ if (tag && HTML_HEADER_TAGS[tag]) {
66
+ return HTML_HEADER_TAGS[tag]?.style
67
+ }
68
+ return undefined
69
+ }
70
+
71
+ export function createWordRules(): DeserializerRule[] {
72
+ return [
73
+ {
74
+ deserialize(el, next) {
75
+ const tag = tagName(el)
76
+
77
+ // Handle list items (both paragraphs and headings)
78
+ if (
79
+ (tag === 'p' || HTML_HEADER_TAGS[tag || '']) &&
80
+ isWordListElement(el)
81
+ ) {
82
+ const headingStyle = getHeadingStyle(el)
83
+ return {
84
+ ...DEFAULT_BLOCK,
85
+ listItem: getListItemStyle(el),
86
+ level: getListItemLevel(el),
87
+ style: headingStyle || BLOCK_DEFAULT_STYLE,
88
+ children: next(el.childNodes),
89
+ }
90
+ }
91
+ return undefined
92
+ },
93
+ },
94
+ ]
95
+ }