@portabletext/block-tools 4.0.2 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. package/lib/_chunks-es/helpers.js +1 -64
  2. package/lib/_chunks-es/helpers.js.map +1 -1
  3. package/lib/index.js +487 -38
  4. package/lib/index.js.map +1 -1
  5. package/package.json +9 -9
  6. package/src/HtmlDeserializer/helpers.ts +1 -183
  7. package/src/HtmlDeserializer/index.ts +14 -25
  8. package/src/HtmlDeserializer/preprocessors/index.ts +8 -6
  9. package/src/HtmlDeserializer/preprocessors/{gdocs.ts → preprocessor.gdocs.ts} +2 -22
  10. package/src/HtmlDeserializer/preprocessors/{html.ts → preprocessor.html.ts} +1 -1
  11. package/src/HtmlDeserializer/preprocessors/{notion.ts → preprocessor.notion.ts} +1 -1
  12. package/src/HtmlDeserializer/preprocessors/{whitespace.ts → preprocessor.whitespace.ts} +28 -3
  13. package/src/HtmlDeserializer/preprocessors/{word.ts → preprocessor.word.ts} +1 -1
  14. package/src/HtmlDeserializer/rules/index.ts +6 -4
  15. package/src/HtmlDeserializer/rules/{gdocs.ts → rules.gdocs.ts} +1 -1
  16. package/src/HtmlDeserializer/rules/{html.ts → rules.html.ts} +3 -3
  17. package/src/HtmlDeserializer/rules/{notion.ts → rules.notion.ts} +1 -1
  18. package/src/HtmlDeserializer/rules/rules.word.ts +95 -0
  19. package/src/HtmlDeserializer/trim-whitespace.ts +157 -0
  20. package/src/HtmlDeserializer/word-online/asserters.word-online.ts +153 -0
  21. package/src/HtmlDeserializer/word-online/preprocessor.word-online.ts +263 -0
  22. package/src/HtmlDeserializer/word-online/rules.word-online.ts +390 -0
  23. package/src/HtmlDeserializer/rules/word.ts +0 -59
  24. package/src/HtmlDeserializer/rules/{whitespace-text-node.ts → rules.whitespace-text-node.ts} +0 -0
@@ -0,0 +1,390 @@
1
+ import type {Schema} from '@portabletext/schema'
2
+ import {DEFAULT_SPAN, HTML_BLOCK_TAGS, HTML_HEADER_TAGS} from '../../constants'
3
+ import type {SchemaMatchers} from '../../schema-matchers'
4
+ import type {DeserializerRule} from '../../types'
5
+ import {keyGenerator} from '../../util/randomKey'
6
+ import {isElement, tagName} from '../helpers'
7
+ import {
8
+ hasEmphasisFormatting,
9
+ hasStrikethroughFormatting,
10
+ hasStrongFormatting,
11
+ hasUnderlineFormatting,
12
+ isFindHit,
13
+ isInBlockquote,
14
+ isInHeading,
15
+ isNormalTextRun,
16
+ isWordOnlineTextRun,
17
+ } from './asserters.word-online'
18
+
19
/**
 * Lookup from the paragraph-style names handled here to block style names.
 *
 * A `Map` is used instead of a plain object so that paragraph-style strings
 * such as "toString" or "constructor" cannot resolve to inherited
 * `Object.prototype` members.
 */
const PARA_STYLE_TO_BLOCK_STYLE: ReadonlyMap<string, string> = new Map([
  ['heading 1', 'h1'],
  ['heading 2', 'h2'],
  ['heading 3', 'h3'],
  ['heading 4', 'h4'],
  ['heading 5', 'h5'],
  ['heading 6', 'h6'],
  ['Quote', 'blockquote'],
])

/**
 * Resolves a paragraph-style name to a block style declared in `schema`.
 *
 * @param schema - Schema whose `styles` list decides which block styles are allowed.
 * @param paraStyle - Paragraph-style name to translate (matched case-sensitively).
 * @returns The name of the matching schema style. Unknown paragraph styles are
 *   treated as `'normal'`. `undefined` is returned when the schema does not
 *   declare the resolved style.
 */
function mapParaStyleToBlockStyle(
  schema: Schema,
  paraStyle: string,
): string | undefined {
  const blockStyle = PARA_STYLE_TO_BLOCK_STYLE.get(paraStyle) ?? 'normal'

  return schema.styles.find((style) => style.name === blockStyle)?.name
}
34
+
35
/**
 * Reads an element's `className` as a plain string.
 *
 * `className` is usually a string, but it may also be an object exposing the
 * value through `baseVal` (SVGAnimatedString); both shapes are handled.
 */
function readClassName(el: Element): string {
  const raw: unknown = el.className

  if (typeof raw === 'string') {
    return raw
  }

  if (raw && typeof raw === 'object') {
    return (raw as {baseVal?: string}).baseVal || ''
  }

  return ''
}

/**
 * Builds the `props` object handed to the image matchers: every attribute of
 * `img`, with non-empty `src` and `alt` values applied on top.
 */
function collectImageProps(img: Element): Record<string, string> {
  const props: Record<string, string> = Object.fromEntries(
    Array.from(img.attributes).map((attr) => [attr.name, attr.value]),
  )
  const src = img.getAttribute('src')
  const alt = img.getAttribute('alt')

  if (src) {
    props.src = src
  }
  if (alt) {
    props.alt = alt
  }

  return props
}

/** Tells whether `node` is an element that `hasUnderlineFormatting` accepts. */
function isUnderlinedElement(node: Node | null): boolean {
  return node !== null && isElement(node) && hasUnderlineFormatting(node)
}

/**
 * Creates the deserializer rules for Word Online markup.
 *
 * The returned rules are, in order:
 * 1. bare `<img>` elements whose class contains `WACImage` (block image),
 * 2. elements whose class contains `WACImageContainer` (inline or block image),
 * 3. `<li>` elements carrying `data-aria-level` (list item blocks),
 * 4. elements carrying `data-parastyle` (styled blocks),
 * 5. Word Online text runs (spans with decorator marks).
 *
 * @param schema - Schema used for the block type name and the allowed styles.
 * @param options - Optional `keyGenerator` (defaults to the imported
 *   `keyGenerator`) and optional `matchers` used to build image objects.
 * @returns The list of rules. A rule returns `undefined` for nodes it does not handle.
 */
export function createWordOnlineRules(
  schema: Schema,
  options: {keyGenerator?: () => string; matchers?: SchemaMatchers},
): DeserializerRule[] {
  // Built per call so every matcher invocation receives a fresh context object
  const createMatcherContext = () => ({
    schema,
    keyGenerator: options.keyGenerator ?? keyGenerator,
  })

  // Runs the block image matcher for `img` and wraps the result in a
  // `__block` object, or yields undefined when the matcher is missing or
  // returns nothing.
  const createBlockImage = (img: Element) => {
    const image = options.matchers?.image?.({
      context: createMatcherContext(),
      props: collectImageProps(img),
    })

    return image ? {_type: '__block' as const, block: image} : undefined
  }

  return [
    // Image rule - handles bare Word Online <img> tags with WACImage class
    {
      deserialize(el) {
        if (!isElement(el) || tagName(el) !== 'img') {
          return undefined
        }

        if (!readClassName(el).includes('WACImage')) {
          return undefined
        }

        // Bare <img> tags are returned as block images
        return createBlockImage(el)
      },
    },
    // Image rule - handles Word Online images wrapped in WACImageContainer
    {
      deserialize(el) {
        if (!isElement(el)) {
          return undefined
        }

        if (!readClassName(el).includes('WACImageContainer')) {
          return undefined
        }

        // Find the img element inside
        const img = el.querySelector('img')
        if (!img) {
          return undefined
        }

        // The image is treated as inline when the container either has no
        // <p> ancestor or sits inside an <li>
        const isInsideListItem = el.closest('li') !== null
        const isInsideParagraph = el.closest('p') !== null

        if (!isInsideParagraph || isInsideListItem) {
          const inlineImage = options.matchers?.inlineImage?.({
            context: createMatcherContext(),
            props: collectImageProps(img),
          })

          if (inlineImage) {
            return inlineImage
          }
        }

        // Block-level image (or fallback when no inline image was produced)
        return createBlockImage(img)
      },
    },
    // List item rule - handles <li> elements with data-aria-level
    {
      deserialize(el, next) {
        if (!isElement(el) || tagName(el) !== 'li') {
          return undefined
        }

        const ariaLevel = el.getAttribute('data-aria-level')

        if (!ariaLevel) {
          return undefined
        }

        const listItem = tagName(el.parentNode) === 'ol' ? 'number' : 'bullet'

        let childNodesToProcess = el.childNodes
        let blockStyle = 'normal'

        // A single element child that is a block-level wrapper is skipped so
        // that its children are deserialized directly into the list item
        const onlyChild = el.childNodes.length === 1 ? el.firstChild : null

        if (onlyChild && isElement(onlyChild)) {
          const childTag = tagName(onlyChild)

          if (
            childTag &&
            (HTML_BLOCK_TAGS[childTag as keyof typeof HTML_BLOCK_TAGS] ||
              HTML_HEADER_TAGS[childTag as keyof typeof HTML_HEADER_TAGS] ||
              childTag === 'word-online-block')
          ) {
            // If it's a word-online-block, extract the style before skipping it
            if (childTag === 'word-online-block') {
              const paraStyle = onlyChild.getAttribute('data-parastyle')
              const foundBlockStyle = paraStyle
                ? mapParaStyleToBlockStyle(schema, paraStyle)
                : undefined

              if (foundBlockStyle) {
                blockStyle = foundBlockStyle
              }
            }

            // Skip the block wrapper and process its children directly
            childNodesToProcess = onlyChild.childNodes
          }
        }

        const children = next(childNodesToProcess)
        let childArray = Array.isArray(children)
          ? children
          : [children].filter(Boolean)

        // Clean up trailing empty or whitespace-only spans: drop them one by
        // one, then right-trim the first span that still has visible text
        while (childArray.length > 0) {
          const lastChild = childArray[childArray.length - 1]

          if (
            lastChild &&
            typeof lastChild === 'object' &&
            'text' in lastChild &&
            typeof lastChild.text === 'string'
          ) {
            const text = (lastChild.text as string).trimEnd()
            if (text === '') {
              // Remove empty span and look at the one before it
              childArray = childArray.slice(0, -1)
            } else {
              if (text !== lastChild.text) {
                // Update with trimmed text
                lastChild.text = text
              }
              break
            }
          } else {
            break
          }
        }

        // A non-numeric or non-positive level attribute falls back to level 1
        // instead of producing NaN or an invalid level
        const parsedLevel = Number.parseInt(ariaLevel, 10)
        const level =
          Number.isNaN(parsedLevel) || parsedLevel < 1 ? 1 : parsedLevel

        return {
          _type: schema.block.name,
          children: childArray,
          markDefs: [],
          style: blockStyle,
          listItem,
          level,
        }
      },
    },
    // Block style rule - handles elements carrying a data-parastyle attribute
    // that resolves to a style declared in the schema
    {
      deserialize(el, next) {
        if (!isElement(el)) {
          return undefined
        }

        const paraStyle = el.getAttribute('data-parastyle')
        const blockStyle = paraStyle
          ? mapParaStyleToBlockStyle(schema, paraStyle)
          : undefined

        if (!blockStyle) {
          return undefined
        }

        const children = next(el.childNodes)

        return {
          _type: schema.block.name,
          style: blockStyle,
          markDefs: [],
          children: Array.isArray(children)
            ? children
            : children
              ? [children]
              : [],
        }
      },
    },
    // TextRun rule
    {
      deserialize(el) {
        if (!isWordOnlineTextRun(el) || !isElement(el)) {
          return undefined
        }

        if (!el.textContent) {
          return undefined
        }

        // Concatenate the text of ALL direct NormalTextRun and FindHit
        // children; other child nodes do not contribute text
        const text = Array.from(el.childNodes)
          .filter((node) => isNormalTextRun(node) || isFindHit(node))
          .map((node) => (isElement(node) ? (node.textContent ?? '') : ''))
          .join('')

        if (!text) {
          return undefined
        }

        const span = {
          ...DEFAULT_SPAN,
          marks: [] as Array<string>,
          text,
        }

        if (hasStrongFormatting(el)) {
          span.marks.push('strong')
        }

        // Don't add italic mark if we're in a heading or blockquote
        if (
          hasEmphasisFormatting(el) &&
          !isInHeading(el) &&
          !isInBlockquote(el)
        ) {
          span.marks.push('em')
        }

        // Underline: a run directly inside an <a> only gets the mark when a
        // sibling of that link is underlined too; otherwise the underline is
        // taken to be default link styling and skipped
        if (hasUnderlineFormatting(el)) {
          const parent = el.parentElement
          const linkElement = parent && tagName(parent) === 'a' ? parent : null

          if (!linkElement) {
            // Not in a link, always add underline
            span.marks.push('underline')
          } else if (
            isUnderlinedElement(linkElement.previousSibling) ||
            isUnderlinedElement(linkElement.nextSibling)
          ) {
            span.marks.push('underline')
          }
        }

        // Add strikethrough mark if the element has strikethrough formatting
        if (hasStrikethroughFormatting(el)) {
          span.marks.push('strike-through')
        }

        return span
      },
    },
  ]
}
@@ -1,59 +0,0 @@
1
- import {BLOCK_DEFAULT_STYLE, DEFAULT_BLOCK} from '../../constants'
2
- import type {DeserializerRule} from '../../types'
3
- import {isElement, tagName} from '../helpers'
4
-
5
/**
 * Derives the list type from the `lfoN` token in an element's inline style.
 *
 * @param el - Node to inspect; anything that is not an element yields `undefined`.
 * @returns `'bullet'` when the token is exactly `lfo1`, `'number'` for any
 *   other `lfoN`, and `undefined` when there is no style attribute or no
 *   `lfoN` token in it.
 */
function getListItemStyle(el: Node): string | undefined {
  const style = isElement(el) ? el.getAttribute('style') : null
  if (!style) {
    return undefined
  }

  // Capture the full number so that lfo10, lfo11, ... are not mistaken for lfo1
  const listFormatMatch = /lfo(\d+)/.exec(style)
  if (!listFormatMatch) {
    return undefined
  }

  return listFormatMatch[1] === '1' ? 'bullet' : 'number'
}
17
-
18
/**
 * Derives the list nesting level from the `levelN` token in an element's
 * inline style.
 *
 * @param el - Node to inspect; anything that is not an element yields `undefined`.
 * @returns The parsed level, `1` when the parsed value is `0`, and
 *   `undefined` when there is no style attribute or no `levelN` token in it.
 */
function getListItemLevel(el: Node): number | undefined {
  const style = isElement(el) ? el.getAttribute('style') : null
  if (!style) {
    return undefined
  }

  // Capture every digit so multi-digit levels are not truncated to the first one
  const levelMatch = /level(\d+)/.exec(style)
  if (!levelMatch) {
    return undefined
  }

  const level = Number.parseInt(levelMatch[1] ?? '', 10)
  return level || 1
}
33
-
34
/**
 * Tells whether `el` is an element whose `className` is exactly one of the
 * three `MsoListParagraphCxSp*` class names.
 */
function isWordListElement(el: Node): boolean {
  if (!isElement(el) || !el.className) {
    return false
  }

  switch (el.className) {
    case 'MsoListParagraphCxSpFirst':
    case 'MsoListParagraphCxSpMiddle':
    case 'MsoListParagraphCxSpLast':
      return true
    default:
      return false
  }
}
41
-
42
/**
 * Creates the deserializer rules for Word list paragraphs.
 *
 * The single rule handles `<p>` elements accepted by `isWordListElement` and
 * turns them into a default-styled block whose `listItem` and `level` come
 * from `getListItemStyle` and `getListItemLevel`. Every other node yields
 * `undefined`.
 */
export default function createWordRules(): DeserializerRule[] {
  const wordListParagraphRule: DeserializerRule = {
    deserialize(el, next) {
      const isWordListParagraph = tagName(el) === 'p' && isWordListElement(el)
      if (!isWordListParagraph) {
        return undefined
      }

      return {
        ...DEFAULT_BLOCK,
        listItem: getListItemStyle(el),
        level: getListItemLevel(el),
        style: BLOCK_DEFAULT_STYLE,
        children: next(el.childNodes),
      }
    },
  }

  return [wordListParagraphRule]
}