@portabletext/block-tools 4.1.8 → 4.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/_chunks-es/helpers.js +81 -9
- package/lib/_chunks-es/helpers.js.map +1 -1
- package/lib/index.js +14 -18
- package/lib/index.js.map +1 -1
- package/package.json +7 -10
- package/src/HtmlDeserializer/flatten-nested-blocks.test.ts +0 -248
- package/src/HtmlDeserializer/flatten-nested-blocks.ts +0 -173
- package/src/HtmlDeserializer/helpers.ts +0 -108
- package/src/HtmlDeserializer/index.ts +0 -315
- package/src/HtmlDeserializer/preprocessors/index.ts +0 -15
- package/src/HtmlDeserializer/preprocessors/preprocessor.gdocs.ts +0 -66
- package/src/HtmlDeserializer/preprocessors/preprocessor.html.ts +0 -57
- package/src/HtmlDeserializer/preprocessors/preprocessor.notion.ts +0 -25
- package/src/HtmlDeserializer/preprocessors/preprocessor.whitespace.ts +0 -56
- package/src/HtmlDeserializer/preprocessors/preprocessor.word.ts +0 -92
- package/src/HtmlDeserializer/preprocessors/xpathResult.ts +0 -13
- package/src/HtmlDeserializer/rules/index.ts +0 -21
- package/src/HtmlDeserializer/rules/rules.gdocs.ts +0 -188
- package/src/HtmlDeserializer/rules/rules.html.ts +0 -356
- package/src/HtmlDeserializer/rules/rules.notion.ts +0 -57
- package/src/HtmlDeserializer/rules/rules.whitespace-text-node.ts +0 -31
- package/src/HtmlDeserializer/rules/rules.word.ts +0 -95
- package/src/HtmlDeserializer/trim-whitespace.ts +0 -157
- package/src/HtmlDeserializer/word-online/asserters.word-online.ts +0 -153
- package/src/HtmlDeserializer/word-online/preprocessor.word-online.ts +0 -263
- package/src/HtmlDeserializer/word-online/rules.word-online.ts +0 -390
- package/src/constants.ts +0 -104
- package/src/index.ts +0 -49
- package/src/rules/_exports/index.ts +0 -1
- package/src/rules/flatten-tables.test.ts +0 -495
- package/src/rules/flatten-tables.ts +0 -216
- package/src/rules/index.ts +0 -1
- package/src/schema-matchers.ts +0 -41
- package/src/types.ts +0 -100
- package/src/util/findBlockType.ts +0 -13
- package/src/util/normalizeBlock.ts +0 -171
- package/src/util/randomKey.ts +0 -28
- package/src/util/resolveJsType.ts +0 -44
|
@@ -1,315 +0,0 @@
|
|
|
1
|
-
import type {Schema} from '@portabletext/schema'
|
|
2
|
-
import {
|
|
3
|
-
isTextBlock,
|
|
4
|
-
type PortableTextBlock,
|
|
5
|
-
type PortableTextObject,
|
|
6
|
-
} from '@portabletext/schema'
|
|
7
|
-
import {vercelStegaClean} from '@vercel/stega'
|
|
8
|
-
import {flatten} from 'lodash'
|
|
9
|
-
import type {
|
|
10
|
-
ArbitraryTypedObject,
|
|
11
|
-
DeserializerRule,
|
|
12
|
-
HtmlDeserializerOptions,
|
|
13
|
-
PlaceholderAnnotation,
|
|
14
|
-
PlaceholderDecorator,
|
|
15
|
-
TypedObject,
|
|
16
|
-
} from '../types'
|
|
17
|
-
import {keyGenerator} from '../util/randomKey'
|
|
18
|
-
import {resolveJsType} from '../util/resolveJsType'
|
|
19
|
-
import {flattenNestedBlocks} from './flatten-nested-blocks'
|
|
20
|
-
import {
|
|
21
|
-
defaultParseHtml,
|
|
22
|
-
ensureRootIsBlocks,
|
|
23
|
-
isMinimalBlock,
|
|
24
|
-
isMinimalSpan,
|
|
25
|
-
isNodeList,
|
|
26
|
-
isPlaceholderAnnotation,
|
|
27
|
-
isPlaceholderDecorator,
|
|
28
|
-
tagName,
|
|
29
|
-
} from './helpers'
|
|
30
|
-
import {preprocessors} from './preprocessors'
|
|
31
|
-
import {createRules} from './rules'
|
|
32
|
-
import {trimWhitespace} from './trim-whitespace'
|
|
33
|
-
|
|
34
|
-
/**
 * HTML Deserializer
 *
 * Turns an HTML string into an array of block objects by parsing it,
 * running the registered preprocessors over the parsed document, and then
 * passing every DOM node through an ordered list of deserializer rules.
 */
export default class HtmlDeserializer {
  keyGenerator: () => string
  schema: Schema
  rules: DeserializerRule[]
  parseHtml: (html: string) => HTMLElement
  whitespaceMode: 'preserve' | 'remove' | 'normalize'
  // Mark definitions collected from annotations during the current `deserialize` call
  _markDefs: PortableTextObject[] = []

  /**
   * Create a new deserializer for the given schema.
   *
   * @param schema - Schema the deserialized blocks should conform to
   * @param options - Options for the deserialization process (custom rules,
   *   key generator, matchers, HTML parser and whitespace mode)
   */
  constructor(schema: Schema, options: HtmlDeserializerOptions = {}) {
    const {rules = [], unstable_whitespaceOnPasteMode = 'preserve'} = options
    const standardRules = createRules(schema, {
      keyGenerator: options.keyGenerator,
      matchers: options.matchers,
    })
    this.schema = schema
    this.keyGenerator = options.keyGenerator ?? keyGenerator
    // Caller-supplied rules go first: the first rule that returns a value wins
    this.rules = [...rules, ...standardRules]
    this.whitespaceMode = unstable_whitespaceOnPasteMode
    const parseHtml = options.parseHtml || defaultParseHtml()
    this.parseHtml = (html) => {
      const cleanHTML = vercelStegaClean(html)
      const doc = parseHtml(cleanHTML)

      // Preprocessors mutate the parsed document in place
      for (const processor of preprocessors) {
        processor(cleanHTML, doc)
      }

      return doc.body
    }
  }

  /**
   * Deserialize HTML.
   *
   * @param html - The HTML to deserialize, as a string
   * @returns Array of blocks - either portable text blocks or other allowed blocks
   */
  deserialize = (html: string): TypedObject[] => {
    // Mark definitions are collected per call, so start from a clean slate
    this._markDefs = []
    const {parseHtml} = this
    const fragment = parseHtml(html)
    const children = Array.from(fragment.childNodes) as HTMLElement[]

    const blocks = trimWhitespace(
      {schema: this.schema},
      this.whitespaceMode,
      flattenNestedBlocks(
        {schema: this.schema},
        ensureRootIsBlocks(
          this.schema,
          this.deserializeElements(children) as Array<ArbitraryTypedObject>,
        ),
      ),
    )

    if (this._markDefs.length > 0) {
      blocks
        .filter((block) => isTextBlock({schema: this.schema}, block))
        .forEach((block) => {
          // Collect the marks used by this block once, instead of once per mark definition
          const marksInUse = flatten(
            block.children.map((child) => child.marks || []),
          )
          block.markDefs = block.markDefs || []
          // Attach only the mark definitions that this block's children reference
          block.markDefs = block.markDefs.concat(
            this._markDefs.filter((def) => marksInUse.includes(def._key)),
          )
        })
    }

    return blocks.map((block) => {
      // Replace the generic 'block' type with the schema's own block type name
      if (block._type === 'block') {
        block._type = this.schema.block.name
      }
      return block
    })
  }

  /**
   * Deserialize an array of DOM elements.
   *
   * @param elements - Array of DOM elements to deserialize
   * @returns Flat array with the deserialized result of every element, in order
   */
  deserializeElements = (elements: Node[] = []): TypedObject[] => {
    let nodes: TypedObject[] = []
    elements.forEach((element) => {
      nodes = nodes.concat(this.deserializeElement(element))
    })
    return nodes
  }

  /**
   * Deserialize a DOM element
   *
   * Tries each rule in order and uses the first one that returns a value.
   * When no rule handles the element, its child nodes are deserialized instead.
   *
   * @param element - The DOM node to deserialize
   * @returns The deserialized object(s), or an empty array if nothing was produced
   * @throws Error if a rule returns `null` or a value that is neither an
   *   object, an array nor `undefined`
   */
  deserializeElement = (element: Node): TypedObject | TypedObject[] => {
    // Handed to rules so they can deserialize nested nodes
    const next = (
      elements: Node | Node[] | NodeList,
    ): TypedObject | TypedObject[] | undefined => {
      if (isNodeList(elements)) {
        return this.deserializeElements(Array.from(elements))
      }

      if (Array.isArray(elements)) {
        return this.deserializeElements(elements)
      }

      if (!elements) {
        return undefined
      }

      return this.deserializeElement(elements)
    }

    // Handed to rules so they can wrap an object in the internal `__block` type
    const block = (props: ArbitraryTypedObject) => {
      return {
        _type: '__block',
        block: props,
      }
    }

    let node: TypedObject | Array<TypedObject> | undefined
    for (let i = 0; i < this.rules.length; i++) {
      const rule = this.rules[i]
      if (!rule.deserialize) {
        continue
      }

      const ret = rule.deserialize(element, next, block)
      const type = resolveJsType(ret)

      if (
        type !== 'array' &&
        type !== 'object' &&
        type !== 'null' &&
        type !== 'undefined'
      ) {
        // Report the value the rule actually returned (`node` is still unset here)
        throw new Error(
          `A rule returned an invalid deserialized representation: "${String(ret)}".`,
        )
      }

      if (ret === undefined) {
        // This rule does not handle the element; try the next one
        continue
      } else if (ret === null) {
        throw new Error('Deserializer rule returned `null`')
      } else if (Array.isArray(ret)) {
        node = ret
      } else if (isPlaceholderDecorator(ret)) {
        node = this.deserializeDecorator(ret)
      } else if (isPlaceholderAnnotation(ret)) {
        node = this.deserializeAnnotation(ret)
      } else {
        node = ret
      }

      // Set list level on list item
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        'listItem' in ret
      ) {
        // Walk up two levels at a time and add one level per enclosing `li`
        let parent = element.parentNode?.parentNode
        while (parent && tagName(parent) === 'li') {
          parent = parent.parentNode?.parentNode
          ret.level = ret.level ? ret.level + 1 : 1
        }
      }

      // Set newlines on spans originating from a block element within a blockquote
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        ret.style === 'blockquote'
      ) {
        const children = ret.children
        const lastIndex = children.length - 1
        // Iterate backwards so that removing a child never shifts an
        // index that has not been visited yet
        for (let index = lastIndex; index >= 0; index--) {
          const child = children[index]
          if (isMinimalSpan(child) && child.text === '\r') {
            child.text = '\n'
            // Drop the newline span when it is the first or last child
            if (index === 0 || index === lastIndex) {
              children.splice(index, 1)
            }
          }
        }
      }
      // The first rule that produced a value wins
      break
    }

    return node || next(element.childNodes) || []
  }

  /**
   * Deserialize a `__decorator` type
   * (an internal made up type to process decorators exclusively)
   *
   * Adds the decorator's name to the front of the marks of every span
   * beneath it that contains non-whitespace text.
   *
   * @param decorator - The placeholder decorator to resolve
   * @returns The decorator's children with the decorator applied, nested
   *   placeholder decorators being resolved recursively
   */
  deserializeDecorator = (decorator: PlaceholderDecorator): TypedObject[] => {
    const {name} = decorator
    const applyDecorator = (node: TypedObject) => {
      if (isPlaceholderDecorator(node)) {
        return this.deserializeDecorator(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(name)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        const block = node as any
        block.children = block.children.map(applyDecorator)
      }
      return node
    }
    return decorator.children.reduce((children, node) => {
      const ret = applyDecorator(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }

  /**
   * Deserialize a `__annotation` object.
   * (an internal made up type to process annotations exclusively)
   *
   * Records the annotation's mark definition on the deserializer and adds
   * its key to the front of the marks of every span beneath it that
   * contains non-whitespace text.
   *
   * @param annotation - The placeholder annotation to resolve
   * @returns The annotation's children with the annotation applied, nested
   *   placeholder annotations being resolved recursively
   */
  deserializeAnnotation = (
    annotation: PlaceholderAnnotation,
  ): TypedObject[] => {
    const {markDef} = annotation
    this._markDefs.push(markDef)
    const applyAnnotation = (node: TypedObject) => {
      if (isPlaceholderAnnotation(node)) {
        return this.deserializeAnnotation(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(markDef._key)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        const block = node as any
        block.children = block.children.map(applyAnnotation)
      }
      return node
    }
    return annotation.children.reduce((children, node) => {
      const ret = applyAnnotation(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }
}
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import {preprocessWordOnline} from '../word-online/preprocessor.word-online'
|
|
2
|
-
import {preprocessGDocs} from './preprocessor.gdocs'
|
|
3
|
-
import {preprocessHTML} from './preprocessor.html'
|
|
4
|
-
import {preprocessNotion} from './preprocessor.notion'
|
|
5
|
-
import {preprocessWhitespace} from './preprocessor.whitespace'
|
|
6
|
-
import {preprocessWord} from './preprocessor.word'
|
|
7
|
-
|
|
8
|
-
/**
 * Document preprocessors, listed in the order they are applied.
 * Each one receives the HTML string and the parsed document.
 */
export const preprocessors = [
  preprocessWhitespace,
  preprocessNotion,
  preprocessWord,
  preprocessWordOnline,
  preprocessGDocs,
  preprocessHTML,
]
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import {tagName} from '../helpers'
|
|
2
|
-
import {_XPathResult} from './xpathResult'
|
|
3
|
-
|
|
4
|
-
/**
 * Tag the elements of a document pasted from Google Docs so that the GDocs
 * rule-set can recognise them, and unwrap the `b` element Google Docs puts
 * around the content.
 *
 * A document is treated as coming from Google Docs when it contains an
 * element whose `id` includes "docs-internal-guid". Other documents are
 * returned untouched.
 *
 * @param _html - The HTML string (unused)
 * @param doc - The parsed document; mutated in place
 * @returns The same document
 */
export function preprocessGDocs(_html: string, doc: Document): Document {
  let gDocsRootOrSiblingNode = doc
    .evaluate(
      '//*[@id and contains(@id, "docs-internal-guid")]',
      doc,
      null,
      _XPathResult.ORDERED_NODE_ITERATOR_TYPE,
      null,
    )
    .iterateNext()

  if (!gDocsRootOrSiblingNode) {
    return doc
  }

  const isWrappedRootTag = tagName(gDocsRootOrSiblingNode) === 'b'

  // If this document isn't wrapped in a 'b' tag, then assume all siblings live on the root level
  if (!isWrappedRootTag) {
    gDocsRootOrSiblingNode = doc.body
  }

  // Tag every child with attribute 'is-google-docs' so that the GDocs rule-set can
  // work exclusively on these children
  const childNodes = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  for (let i = childNodes.snapshotLength - 1; i >= 0; i--) {
    const elm = childNodes.snapshotItem(i) as HTMLElement | null
    if (!elm) {
      continue
    }

    elm.setAttribute('data-is-google-docs', 'true')

    // In the unwrapped case `gDocsRootOrSiblingNode` is `doc.body`, so this
    // single comparison also covers direct children of the body.
    if (elm.parentElement === gDocsRootOrSiblingNode) {
      elm.setAttribute('data-is-root-node', 'true')
    }

    // Handle checkmark lists - The first child of a list item is an image with a checkmark, and the serializer
    // expects the first child to be the text node
    if (
      tagName(elm) === 'li' &&
      elm.firstChild &&
      tagName(elm.firstChild) === 'img'
    ) {
      elm.removeChild(elm.firstChild)
    }
  }

  // Remove that 'b' which Google Docs wraps the HTML content in
  // NOTE(review): this replaces the body's first element child, which is
  // presumably the matched 'b' wrapper - confirm for documents where the
  // wrapper is not the first element.
  if (isWrappedRootTag) {
    doc.body.firstElementChild?.replaceWith(
      ...Array.from(gDocsRootOrSiblingNode.childNodes),
    )
  }

  return doc
}
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import {_XPathResult} from './xpathResult'
|
|
2
|
-
|
|
3
|
-
// XPath expressions selecting the nodes that `preprocessHTML` removes from the document
const unwantedWordDocumentPaths = [
  // Text nodes sitting directly under structural elements
  '/html/text()',
  '/html/head/text()',
  '/html/body/text()',
  '/html/body/ul/text()',
  '/html/body/ol/text()',
  // Comments and non-content elements, anywhere in the document
  '//comment()',
  '//style',
  '//xml',
  '//script',
  '//meta',
  '//link',
]
|
|
17
|
-
|
|
18
|
-
/**
 * Generic HTML clean-up: wraps text that sits directly on the body in
 * `span` elements and removes the nodes matched by
 * `unwantedWordDocumentPaths`.
 *
 * @param _html - The HTML string (unused)
 * @param doc - The parsed document; mutated in place
 * @returns The same document
 */
export function preprocessHTML(_html: string, doc: Document): Document {
  // Make sure text directly on the body is wrapped in spans.
  // This mimics what the browser does before putting html on the clipboard,
  // when used in a script context with JSDOM
  const bodyTextNodes = doc.evaluate(
    '/html/body/text()',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let textIndex = bodyTextNodes.snapshotLength
  while (textIndex-- > 0) {
    const textNode = bodyTextNodes.snapshotItem(textIndex) as HTMLElement
    const content = textNode.textContent || ''
    // Trailing whitespace other than newlines does not count as content
    const hasContent = content.replace(/[^\S\n]+$/g, '') !== ''

    if (!hasContent) {
      textNode.parentNode?.removeChild(textNode)
      continue
    }

    const wrapper = doc.createElement('span')
    wrapper.appendChild(doc.createTextNode(content))
    textNode.parentNode?.replaceChild(wrapper, textNode)
  }

  const unwantedNodes = doc.evaluate(
    unwantedWordDocumentPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let unwantedIndex = unwantedNodes.snapshotLength
  while (unwantedIndex-- > 0) {
    const unwanted = unwantedNodes.snapshotItem(unwantedIndex)
    if (unwanted) {
      unwanted.parentNode?.removeChild(unwanted)
    }
  }

  return doc
}
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import {_XPathResult} from './xpathResult'
|
|
2
|
-
|
|
3
|
-
/**
 * Tag every element of a document pasted from Notion with
 * `data-is-notion="true"`. A document is treated as coming from Notion when
 * the HTML string contains a `<!-- notionvc:... -->` comment; other
 * documents are returned untouched.
 *
 * @param html - The HTML string, used for detection only
 * @param doc - The parsed document; mutated in place
 * @returns The same document
 */
export function preprocessNotion(html: string, doc: Document): Document {
  const notionMarker = /<!-- notionvc:.*?-->/g

  if (!notionMarker.test(html)) {
    return doc
  }

  // Tag every child with attribute 'is-notion' so that the Notion rule-set can
  // work exclusively on these children
  const allElements = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let index = allElements.snapshotLength
  while (index-- > 0) {
    const element = allElements.snapshotItem(index) as HTMLElement
    element?.setAttribute('data-is-notion', 'true')
  }

  return doc
}
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import {PRESERVE_WHITESPACE_TAGS} from '../../constants'
|
|
2
|
-
import {_XPathResult} from './xpathResult'
|
|
3
|
-
|
|
4
|
-
// Tags whose children are block-level elements rather than inline text content.
const BLOCK_CONTAINER_ELEMENTS = [
  'body',
  // Table structure
  'table',
  'tbody',
  'thead',
  'tfoot',
  'tr',
  // Lists
  'ul',
  'ol',
]
|
|
15
|
-
|
|
16
|
-
/**
 * Collapse whitespace in the text nodes of the document body, except
 * inside tags listed in `PRESERVE_WHITESPACE_TAGS`. Whitespace-only text
 * directly inside a block container element is removed altogether.
 *
 * @param _ - The HTML string (unused)
 * @param doc - The parsed document; mutated in place
 * @returns The same document
 */
export function preprocessWhitespace(_: string, doc: Document): Document {
  const visit = (node: Node): void => {
    // NOTE(review): `_XPathResult.BOOLEAN_TYPE` is 3, the same number as the
    // DOM text node type, so this check selects text nodes. A dedicated
    // text-node constant would state the intent directly.
    const isNormalizableText =
      node.nodeType === _XPathResult.BOOLEAN_TYPE &&
      !PRESERVE_WHITESPACE_TAGS.includes(
        node.parentElement?.tagName.toLowerCase() || '',
      )

    if (!isNormalizableText) {
      // Walk the children back to front so removals don't disturb the iteration
      for (let i = node.childNodes.length - 1; i >= 0; i--) {
        visit(node.childNodes[i])
      }
      return
    }

    const normalized =
      node.textContent
        ?.replace(/\s\s+/g, ' ') // Collapse runs of whitespace into one space
        .replace(/[\r\n]+/g, ' ') || '' // Turn remaining newlines into spaces
    const parentTag = node.parentElement?.tagName.toLowerCase()
    const isBlankInBlockContainer =
      parentTag &&
      BLOCK_CONTAINER_ELEMENTS.includes(parentTag) &&
      normalized.trim() === ''

    if (isBlankInBlockContainer) {
      // Whitespace-only text between block-level children carries no content
      node.parentNode?.removeChild(node)
    } else {
      node.textContent = normalized
    }
  }

  // Start from the body and work through the whole tree
  visit(doc.body)

  return doc
}
|
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
import {_XPathResult} from './xpathResult'
|
|
2
|
-
|
|
3
|
-
// Detects HTML produced by Microsoft Word: an `Mso*` class, an inline style
// containing an `mso-` property, a `w:WordDocument` marker, an `<o:...>`
// element or a closing `</font>` tag.
// The style alternative scans up to the same quote character that opened the
// attribute, so a single-quoted style may contain double quotes and the
// match cannot run on into a following attribute.
const WORD_HTML_REGEX =
  /(class="?Mso|style=(?:"[^"]*?|'[^']*?)\bmso-|w:WordDocument|<o:\w+>|<\/font>)/
|
|
5
|
-
|
|
6
|
-
// XPath expressions for elements that get removed from a Word document
const unwantedPaths = [
  '//o:p',
  "//span[@style='mso-list:Ignore']",
  "//span[@style='mso-list: Ignore']",
]

// XPath expressions for elements that get rebuilt as other tags
const mappedPaths = [
  // Paragraph classes
  "//p[@class='MsoTocHeading']",
  "//p[@class='MsoTitle']",
  "//p[@class='MsoToaHeading']",
  "//p[@class='MsoSubtitle']",
  // Span classes
  "//span[@class='MsoSubtleEmphasis']",
  "//span[@class='MsoIntenseEmphasis']",
]

// For each Word class name: the tag(s) its element is rebuilt as,
// listed from the outermost tag to the innermost one
const elementMap: Record<string, string[] | undefined> = {
  MsoTocHeading: ['h3'],
  MsoTitle: ['h1'],
  MsoToaHeading: ['h2'],
  MsoSubtitle: ['h5'],
  MsoSubtleEmphasis: ['span', 'em'],
  MsoIntenseEmphasis: ['span', 'em', 'strong'],
}
|
|
33
|
-
|
|
34
|
-
/**
 * Tell whether the HTML string matches `WORD_HTML_REGEX`.
 *
 * @param html - The HTML string to inspect
 * @returns `true` when the pattern matches, otherwise `false`
 */
function isWordHtml(html: string): boolean {
  const matchesWordPattern = WORD_HTML_REGEX.test(html)
  return matchesWordPattern
}
|
|
37
|
-
|
|
38
|
-
/**
 * Clean up a document pasted from Microsoft Word: removes the elements
 * matched by `unwantedPaths` and rebuilds the elements matched by
 * `mappedPaths` as the tags given in `elementMap`. Documents that
 * `isWordHtml` does not recognise are returned untouched.
 *
 * @param html - The HTML string, used for detection only
 * @param doc - The parsed document; mutated in place
 * @returns The same document
 */
export function preprocessWord(html: string, doc: Document): Document {
  if (!isWordHtml(html)) {
    return doc
  }

  // Resolves the `o:` prefix used in `unwantedPaths`
  const resolveNamespace = (prefix: string | null): string | null =>
    prefix === 'o' ? 'urn:schemas-microsoft-com:office:office' : null

  const unwantedNodes = doc.evaluate(
    unwantedPaths.join('|'),
    doc,
    resolveNamespace,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let unwantedIndex = unwantedNodes.snapshotLength
  while (unwantedIndex-- > 0) {
    const unwanted = unwantedNodes.snapshotItem(unwantedIndex)
    if (unwanted?.parentNode) {
      unwanted.parentNode.removeChild(unwanted)
    }
  }

  // Transform mapped elements into what they should be mapped to
  const mappedElements = doc.evaluate(
    mappedPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let mappedIndex = mappedElements.snapshotLength
  while (mappedIndex-- > 0) {
    const source = mappedElements.snapshotItem(mappedIndex) as HTMLElement
    const tags = elementMap[source.className]
    // Only the text content is carried over to the replacement element
    const text = doc.createTextNode(source.textContent || '')
    if (!tags) {
      continue
    }

    // Build the replacement: the first tag is the outermost element and
    // each following tag is nested inside the previous one
    const [outerTag, ...nestedTags] = tags
    const outermost = doc.createElement(outerTag)
    const innermost = nestedTags.reduce<HTMLElement>(
      (parent, tag) => parent.appendChild(doc.createElement(tag)),
      outermost,
    )
    innermost.appendChild(text)
    source?.parentNode?.replaceChild(outermost, source)
  }

  return doc
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
// Local copy of the XPath result type constants.
// We need this here if run server side
export const _XPathResult = {
  // Value results
  ANY_TYPE: 0,
  NUMBER_TYPE: 1,
  STRING_TYPE: 2,
  BOOLEAN_TYPE: 3,
  // Node iterators
  UNORDERED_NODE_ITERATOR_TYPE: 4,
  ORDERED_NODE_ITERATOR_TYPE: 5,
  // Node snapshots
  UNORDERED_NODE_SNAPSHOT_TYPE: 6,
  ORDERED_NODE_SNAPSHOT_TYPE: 7,
  // Single nodes
  ANY_UNORDERED_NODE_TYPE: 8,
  FIRST_ORDERED_NODE_TYPE: 9,
}
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import type {Schema} from '@portabletext/schema'
|
|
2
|
-
import type {SchemaMatchers} from '../../schema-matchers'
|
|
3
|
-
import type {DeserializerRule} from '../../types'
|
|
4
|
-
import {createWordOnlineRules} from '../word-online/rules.word-online'
|
|
5
|
-
import {createGDocsRules} from './rules.gdocs'
|
|
6
|
-
import {createHTMLRules} from './rules.html'
|
|
7
|
-
import {createNotionRules} from './rules.notion'
|
|
8
|
-
import {createWordRules} from './rules.word'
|
|
9
|
-
|
|
10
|
-
/**
 * Build the standard deserializer rules by concatenating the Word,
 * Word Online, Notion, Google Docs and generic HTML rule-sets, in that order.
 *
 * @param schema - Schema passed to the rule-sets that take one
 * @param options - Key generator and matchers passed to the Word Online and
 *   HTML rule-sets
 * @returns A single flat array of rules
 */
export function createRules(
  schema: Schema,
  options: {keyGenerator?: () => string; matchers?: SchemaMatchers},
): DeserializerRule[] {
  const wordRules = createWordRules()
  const wordOnlineRules = createWordOnlineRules(schema, options)
  const notionRules = createNotionRules()
  const gDocsRules = createGDocsRules(schema)
  const htmlRules = createHTMLRules(schema, options)

  return [
    ...wordRules,
    ...wordOnlineRules,
    ...notionRules,
    ...gDocsRules,
    ...htmlRules,
  ]
}
|