@portabletext/block-tools 4.0.2 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/_chunks-es/helpers.js +1 -64
- package/lib/_chunks-es/helpers.js.map +1 -1
- package/lib/index.js +487 -38
- package/lib/index.js.map +1 -1
- package/package.json +9 -9
- package/src/HtmlDeserializer/helpers.ts +1 -183
- package/src/HtmlDeserializer/index.ts +14 -25
- package/src/HtmlDeserializer/preprocessors/index.ts +8 -6
- package/src/HtmlDeserializer/preprocessors/{gdocs.ts → preprocessor.gdocs.ts} +2 -22
- package/src/HtmlDeserializer/preprocessors/{html.ts → preprocessor.html.ts} +1 -1
- package/src/HtmlDeserializer/preprocessors/{notion.ts → preprocessor.notion.ts} +1 -1
- package/src/HtmlDeserializer/preprocessors/{whitespace.ts → preprocessor.whitespace.ts} +28 -3
- package/src/HtmlDeserializer/preprocessors/{word.ts → preprocessor.word.ts} +1 -1
- package/src/HtmlDeserializer/rules/index.ts +6 -4
- package/src/HtmlDeserializer/rules/{gdocs.ts → rules.gdocs.ts} +1 -1
- package/src/HtmlDeserializer/rules/{html.ts → rules.html.ts} +3 -3
- package/src/HtmlDeserializer/rules/{notion.ts → rules.notion.ts} +1 -1
- package/src/HtmlDeserializer/rules/rules.word.ts +95 -0
- package/src/HtmlDeserializer/trim-whitespace.ts +157 -0
- package/src/HtmlDeserializer/word-online/asserters.word-online.ts +153 -0
- package/src/HtmlDeserializer/word-online/preprocessor.word-online.ts +263 -0
- package/src/HtmlDeserializer/word-online/rules.word-online.ts +390 -0
- package/src/HtmlDeserializer/rules/word.ts +0 -59
- /package/src/HtmlDeserializer/rules/{whitespace-text-node.ts → rules.whitespace-text-node.ts} +0 -0
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
import type {Schema} from '@portabletext/schema'
|
|
2
|
+
import {DEFAULT_SPAN, HTML_BLOCK_TAGS, HTML_HEADER_TAGS} from '../../constants'
|
|
3
|
+
import type {SchemaMatchers} from '../../schema-matchers'
|
|
4
|
+
import type {DeserializerRule} from '../../types'
|
|
5
|
+
import {keyGenerator} from '../../util/randomKey'
|
|
6
|
+
import {isElement, tagName} from '../helpers'
|
|
7
|
+
import {
|
|
8
|
+
hasEmphasisFormatting,
|
|
9
|
+
hasStrikethroughFormatting,
|
|
10
|
+
hasStrongFormatting,
|
|
11
|
+
hasUnderlineFormatting,
|
|
12
|
+
isFindHit,
|
|
13
|
+
isInBlockquote,
|
|
14
|
+
isInHeading,
|
|
15
|
+
isNormalTextRun,
|
|
16
|
+
isWordOnlineTextRun,
|
|
17
|
+
} from './asserters.word-online'
|
|
18
|
+
|
|
19
|
+
/**
 * Lookup table from the paragraph style names handled here (the values read
 * from the `data-parastyle` attribute) to Portable Text block style names.
 *
 * A `Map` is used instead of an object literal so that only the keys listed
 * here can match; names such as `constructor` or `toString` must not resolve
 * to members inherited from `Object.prototype`.
 */
const PARA_STYLE_TO_BLOCK_STYLE: ReadonlyMap<string, string> = new Map([
  ['heading 1', 'h1'],
  ['heading 2', 'h2'],
  ['heading 3', 'h3'],
  ['heading 4', 'h4'],
  ['heading 5', 'h5'],
  ['heading 6', 'h6'],
  ['Quote', 'blockquote'],
])

/**
 * Maps a paragraph style name to a block style that exists in the schema.
 *
 * @param schema - Schema whose `styles` list decides which block styles are allowed
 * @param paraStyle - Paragraph style name (matched case-sensitively)
 * @returns The name of the matching schema style. Unknown paragraph styles map
 *   to `'normal'`. Returns `undefined` when the schema does not define the
 *   resulting style.
 */
function mapParaStyleToBlockStyle(schema: Schema, paraStyle: string) {
  const blockStyle = PARA_STYLE_TO_BLOCK_STYLE.get(paraStyle) ?? 'normal'

  return schema.styles.find((style) => style.name === blockStyle)?.name
}
|
|
34
|
+
|
|
35
|
+
/**
 * Returns an element's `className` as a plain string.
 *
 * `className` might be a string or an SVGAnimatedString; for the latter the
 * `baseVal` property is used. Anything else yields an empty string.
 */
function resolveWordOnlineClassName(el: Element): string {
  const classNameRaw: unknown = el.className

  if (typeof classNameRaw === 'string') {
    return classNameRaw
  }

  if (classNameRaw && typeof classNameRaw === 'object') {
    // SVGAnimatedString has baseVal property
    return (classNameRaw as {baseVal?: string}).baseVal || ''
  }

  return ''
}

/**
 * Builds the `props` object handed to the image matchers: every attribute of
 * the image element keyed by attribute name, with non-empty `src` and `alt`
 * applied on top.
 */
function collectWordOnlineImageProps(img: Element) {
  const src = img.getAttribute('src') ?? undefined
  const alt = img.getAttribute('alt') ?? undefined

  const props = Object.fromEntries(
    Array.from(img.attributes).map((attr) => [attr.name, attr.value]),
  )

  return {
    ...props,
    ...(src ? {src} : {}),
    ...(alt ? {alt} : {}),
  }
}

/**
 * Creates the deserializer rules for HTML copied from Word Online.
 *
 * The returned rules are, in order:
 * 1. bare `<img>` elements whose class contains `WACImage` (block images),
 * 2. elements whose class contains `WACImageContainer` (inline or block images),
 * 3. `<li>` elements carrying `data-aria-level` (list item blocks),
 * 4. elements carrying `data-parastyle` (styled blocks such as headings/quotes),
 * 5. Word Online text runs (spans with decorator marks).
 *
 * Every rule returns `undefined` when it does not apply to the node.
 *
 * @param schema - Schema used for the block type name and the allowed styles
 * @param options - Optional `keyGenerator` (defaults to the package's
 *   `keyGenerator`) and the schema `matchers` used to build image objects
 * @returns The list of deserializer rules
 */
export function createWordOnlineRules(
  schema: Schema,
  options: {keyGenerator?: () => string; matchers?: SchemaMatchers},
): DeserializerRule[] {
  // Built per call so each matcher invocation gets a fresh context object
  const matcherContext = () => ({
    schema: schema,
    keyGenerator: options.keyGenerator ?? keyGenerator,
  })

  return [
    // Image rule - handles bare Word Online <img> tags with WACImage class
    {
      deserialize(el) {
        if (!isElement(el) || tagName(el) !== 'img') {
          return undefined
        }

        if (!resolveWordOnlineClassName(el).includes('WACImage')) {
          return undefined
        }

        // Bare <img> tags are typically block-level, not inline
        // They should be returned as block images
        const image = options.matchers?.image?.({
          context: matcherContext(),
          props: collectWordOnlineImageProps(el),
        })

        if (image) {
          return {
            _type: '__block',
            block: image,
          }
        }

        return undefined
      },
    },
    // Image rule - handles Word Online images wrapped in WACImageContainer
    {
      deserialize(el) {
        if (!isElement(el)) {
          return undefined
        }

        if (!resolveWordOnlineClassName(el).includes('WACImageContainer')) {
          return undefined
        }

        // Find the img element inside
        const img = el.querySelector('img')
        if (!img) {
          return undefined
        }

        // Determine if this should be an inline or block-level image
        // Word Online inline images:
        // 1. Siblings of TextRun spans (not wrapped in paragraphs)
        // 2. Inside list items (should be inline relative to the list item)
        const isInsideListItem = el.closest('li') !== null
        const isInsideParagraph = el.closest('p') !== null

        if (!isInsideParagraph || isInsideListItem) {
          // Inline image (either not in a paragraph, or inside a list item)
          const inlineImage = options.matchers?.inlineImage?.({
            context: matcherContext(),
            props: collectWordOnlineImageProps(img),
          })

          if (inlineImage) {
            return inlineImage
          }
        }

        // Block-level image (or fallback if inline image not supported)
        const image = options.matchers?.image?.({
          context: matcherContext(),
          props: collectWordOnlineImageProps(img),
        })

        if (image) {
          return {
            _type: '__block',
            block: image,
          }
        }

        return undefined
      },
    },
    // List item rule - handles <li> elements with aria-level
    {
      deserialize(el, next) {
        if (!isElement(el) || tagName(el) !== 'li') {
          return undefined
        }

        const ariaLevel = el.getAttribute('data-aria-level')

        if (!ariaLevel) {
          return undefined
        }

        const listItem = tagName(el.parentNode) === 'ol' ? 'number' : 'bullet'

        let childNodesToProcess = el.childNodes
        let blockStyle = 'normal'

        // A single block-like wrapper child is unwrapped so that its children
        // become the children of the list item block
        const onlyChild = el.childNodes.length === 1 ? el.firstChild : null

        if (onlyChild && isElement(onlyChild)) {
          const childTag = tagName(onlyChild)
          const isWordOnlineBlock = childTag === 'word-online-block'

          if (
            childTag &&
            (HTML_BLOCK_TAGS[childTag as keyof typeof HTML_BLOCK_TAGS] ||
              HTML_HEADER_TAGS[childTag as keyof typeof HTML_HEADER_TAGS] ||
              isWordOnlineBlock)
          ) {
            // If it's a word-online-block, extract the style before skipping it
            if (isWordOnlineBlock) {
              const paraStyle = onlyChild.getAttribute('data-parastyle')
              const foundBlockStyle = paraStyle
                ? mapParaStyleToBlockStyle(schema, paraStyle)
                : undefined

              if (foundBlockStyle) {
                blockStyle = foundBlockStyle
              }
            }

            // Skip the block wrapper and process its children directly
            childNodesToProcess = onlyChild.childNodes
          }
        }

        const children = next(childNodesToProcess)
        let childArray = Array.isArray(children)
          ? children
          : [children].filter(Boolean)

        // Clean up trailing empty or whitespace-only spans
        // Word Online often adds trailing tabs/breaks and extra spaces in list items
        while (childArray.length > 0) {
          const lastChild = childArray[childArray.length - 1]

          if (
            !lastChild ||
            typeof lastChild !== 'object' ||
            !('text' in lastChild)
          ) {
            // Last child is not a text span; nothing more to trim
            break
          }

          const text = (lastChild.text as string).trimEnd()

          if (text !== '') {
            // First span with visible text: strip its trailing whitespace and stop
            if (text !== lastChild.text) {
              lastChild.text = text
            }
            break
          }

          // Remove empty span and inspect the one before it
          childArray = childArray.slice(0, -1)
        }

        return {
          _type: schema.block.name,
          children: childArray,
          markDefs: [],
          style: blockStyle,
          listItem,
          // A non-numeric (or zero) level would otherwise produce NaN/0;
          // fall back to the first level instead
          level: Number.parseInt(ariaLevel, 10) || 1,
        }
      },
    },
    // Block style rule - handles paragraph styles like Quote
    // The preprocessor wraps grouped NormalTextRun spans in a word-online-block element
    {
      deserialize(el, next) {
        if (!isElement(el)) {
          return undefined
        }

        const paraStyle = el.getAttribute('data-parastyle')
        const blockStyle = paraStyle
          ? mapParaStyleToBlockStyle(schema, paraStyle)
          : undefined

        if (!blockStyle) {
          return undefined
        }

        const children = next(el.childNodes)

        return {
          _type: schema.block.name,
          style: blockStyle,
          markDefs: [],
          children: Array.isArray(children)
            ? children
            : children
              ? [children]
              : [],
        }
      },
    },
    // TextRun rule
    {
      deserialize(el) {
        if (!isWordOnlineTextRun(el) || !isElement(el)) {
          return undefined
        }

        if (!el.textContent) {
          return undefined
        }

        // Find ALL NormalTextRun and FindHit children and extract text from them
        // (Word Online sometimes splits text across multiple spans)
        // FindHit is used for search result highlighting
        const text = Array.from(el.childNodes)
          .filter((node) => isNormalTextRun(node) || isFindHit(node))
          .map((span) => (isElement(span) ? (span.textContent ?? '') : ''))
          .join('')

        if (!text) {
          return undefined
        }

        const span = {
          ...DEFAULT_SPAN,
          marks: [] as Array<string>,
          text,
        }

        if (hasStrongFormatting(el)) {
          span.marks.push('strong')
        }

        // Don't add italic mark if we're in a heading or blockquote (it's part of their default style)
        if (
          hasEmphasisFormatting(el) &&
          !isInHeading(el) &&
          !isInBlockquote(el)
        ) {
          span.marks.push('em')
        }

        // Add underline mark if the element has explicit underline formatting
        // Word Online always adds underline to links, so we need to distinguish between:
        // 1. Default link underline (skip)
        // 2. Explicit user underline that includes the link (add)
        // We check: if the link is surrounded by underlined content, it's explicit user underline
        if (hasUnderlineFormatting(el)) {
          const parent = el.parentElement
          const linkElement = parent && tagName(parent) === 'a' ? parent : null

          if (linkElement) {
            // Check if there are underlined siblings of the link
            const prevSibling = linkElement.previousSibling
            const nextSibling = linkElement.nextSibling

            // If either sibling is an underlined TextRun, the link is part of explicit underline
            const hasPrevUnderline =
              prevSibling &&
              isElement(prevSibling) &&
              hasUnderlineFormatting(prevSibling)
            const hasNextUnderline =
              nextSibling &&
              isElement(nextSibling) &&
              hasUnderlineFormatting(nextSibling)

            if (hasPrevUnderline || hasNextUnderline) {
              span.marks.push('underline')
            }
            // Otherwise, it's just default link styling, don't add underline mark
          } else {
            // Not in a link, always add underline
            span.marks.push('underline')
          }
        }

        // Add strikethrough mark if the element has strikethrough formatting
        if (hasStrikethroughFormatting(el)) {
          span.marks.push('strike-through')
        }

        return span
      },
    },
  ]
}
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import {BLOCK_DEFAULT_STYLE, DEFAULT_BLOCK} from '../../constants'
|
|
2
|
-
import type {DeserializerRule} from '../../types'
|
|
3
|
-
import {isElement, tagName} from '../helpers'
|
|
4
|
-
|
|
5
|
-
/**
 * Derives the list type of a Word list paragraph from its inline `style`
 * attribute.
 *
 * @param el - Node to inspect
 * @returns `undefined` when the node is not an element, has no `style`
 *   attribute, or the style contains no `lfo<digits>` token. Otherwise
 *   `'bullet'` when the style contains `lfo1`, and `'number'` for any other
 *   `lfo` token.
 */
function getListItemStyle(el: Node): string | undefined {
  const inlineStyle = isElement(el) && el.getAttribute('style')

  if (!inlineStyle || !/lfo\d+/.test(inlineStyle)) {
    return undefined
  }

  if (/lfo1/.test(inlineStyle)) {
    return 'bullet'
  }

  return 'number'
}
|
|
17
|
-
|
|
18
|
-
/**
 * Reads the list nesting level of a Word list paragraph from its inline
 * `style` attribute.
 *
 * @param el - Node to inspect
 * @returns `undefined` when the node is not an element, has no `style`
 *   attribute, or the style contains no `level<digits>` token. Otherwise the
 *   value of the first digit following `level`, with `0` replaced by `1`.
 */
function getListItemLevel(el: Node): number | undefined {
  const inlineStyle = isElement(el) && el.getAttribute('style')
  if (!inlineStyle) {
    return undefined
  }

  // Only the first digit after "level" is considered
  const levelToken = /level(\d)/.exec(inlineStyle)
  if (!levelToken) {
    return undefined
  }

  const firstDigit = levelToken[1]
  const parsedLevel = firstDigit ? Number.parseInt(firstDigit, 10) : 1

  // A parsed level of 0 falls back to the first level
  return parsedLevel || 1
}
|
|
33
|
-
|
|
34
|
-
/**
 * Tells whether a node is an element whose `className` is exactly one of the
 * three `MsoListParagraphCxSp*` list paragraph classes.
 *
 * @param el - Node to inspect
 * @returns `true` only for elements with one of those exact class names
 */
function isWordListElement(el: Node): boolean {
  if (!isElement(el) || !el.className) {
    return false
  }

  const wordListClassNames = [
    'MsoListParagraphCxSpFirst',
    'MsoListParagraphCxSpMiddle',
    'MsoListParagraphCxSpLast',
  ]

  return wordListClassNames.includes(el.className)
}
|
|
41
|
-
|
|
42
|
-
/**
 * Creates the deserializer rules for HTML pasted from Word.
 *
 * The single rule turns `<p>` elements recognized by `isWordListElement` into
 * list item blocks: the list type and level come from the element's inline
 * style, the block style is the default one, and the children are produced by
 * deserializing the element's child nodes. Any other node yields `undefined`.
 *
 * @returns The list of deserializer rules
 */
export default function createWordRules(): DeserializerRule[] {
  const wordListParagraphRule: DeserializerRule = {
    deserialize(el, next) {
      const isListParagraph = tagName(el) === 'p' && isWordListElement(el)

      if (!isListParagraph) {
        return undefined
      }

      return {
        ...DEFAULT_BLOCK,
        listItem: getListItemStyle(el),
        level: getListItemLevel(el),
        style: BLOCK_DEFAULT_STYLE,
        children: next(el.childNodes),
      }
    },
  }

  return [wordListParagraphRule]
}
|
/package/src/HtmlDeserializer/rules/{whitespace-text-node.ts → rules.whitespace-text-node.ts}
RENAMED
|
File without changes
|