@portabletext/block-tools 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +226 -0
- package/lib/index.cjs +1056 -0
- package/lib/index.cjs.map +1 -0
- package/lib/index.d.cts +172 -0
- package/lib/index.d.ts +172 -0
- package/lib/index.js +1056 -0
- package/lib/index.js.map +1 -0
- package/package.json +71 -0
- package/src/HtmlDeserializer/helpers.ts +363 -0
- package/src/HtmlDeserializer/index.ts +313 -0
- package/src/HtmlDeserializer/preprocessors/gdocs.ts +86 -0
- package/src/HtmlDeserializer/preprocessors/html.ts +57 -0
- package/src/HtmlDeserializer/preprocessors/index.ts +13 -0
- package/src/HtmlDeserializer/preprocessors/notion.ts +25 -0
- package/src/HtmlDeserializer/preprocessors/whitespace.ts +31 -0
- package/src/HtmlDeserializer/preprocessors/word.ts +92 -0
- package/src/HtmlDeserializer/preprocessors/xpathResult.ts +13 -0
- package/src/HtmlDeserializer/rules/gdocs.ts +183 -0
- package/src/HtmlDeserializer/rules/html.ts +264 -0
- package/src/HtmlDeserializer/rules/index.ts +18 -0
- package/src/HtmlDeserializer/rules/notion.ts +60 -0
- package/src/HtmlDeserializer/rules/word.ts +59 -0
- package/src/constants.ts +104 -0
- package/src/index.ts +52 -0
- package/src/types.ts +139 -0
- package/src/util/blockContentTypeFeatures.ts +141 -0
- package/src/util/findBlockType.ts +13 -0
- package/src/util/normalizeBlock.ts +142 -0
- package/src/util/randomKey.ts +26 -0
- package/src/util/resolveJsType.ts +44 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
ArraySchemaType,
|
|
3
|
+
PortableTextBlock,
|
|
4
|
+
PortableTextObject,
|
|
5
|
+
PortableTextTextBlock,
|
|
6
|
+
} from '@sanity/types'
|
|
7
|
+
import {flatten} from 'lodash'
|
|
8
|
+
import type {
|
|
9
|
+
ArbitraryTypedObject,
|
|
10
|
+
DeserializerRule,
|
|
11
|
+
HtmlDeserializerOptions,
|
|
12
|
+
PlaceholderAnnotation,
|
|
13
|
+
PlaceholderDecorator,
|
|
14
|
+
TypedObject,
|
|
15
|
+
} from '../types'
|
|
16
|
+
import {findBlockType} from '../util/findBlockType'
|
|
17
|
+
import {resolveJsType} from '../util/resolveJsType'
|
|
18
|
+
import {
|
|
19
|
+
createRuleOptions,
|
|
20
|
+
defaultParseHtml,
|
|
21
|
+
ensureRootIsBlocks,
|
|
22
|
+
flattenNestedBlocks,
|
|
23
|
+
isMinimalBlock,
|
|
24
|
+
isMinimalSpan,
|
|
25
|
+
isNodeList,
|
|
26
|
+
isPlaceholderAnnotation,
|
|
27
|
+
isPlaceholderDecorator,
|
|
28
|
+
preprocess,
|
|
29
|
+
tagName,
|
|
30
|
+
trimWhitespace,
|
|
31
|
+
} from './helpers'
|
|
32
|
+
import {createRules} from './rules'
|
|
33
|
+
|
|
34
|
+
/**
 * HTML Deserializer
 *
 * Turns an HTML string into an array of typed objects by running every DOM
 * node through an ordered list of deserializer rules. User supplied rules are
 * placed before the standard rules, and the first rule that returns something
 * other than `undefined` for a node decides how that node is deserialized.
 */
export default class HtmlDeserializer {
  blockContentType: ArraySchemaType
  rules: DeserializerRule[]
  parseHtml: (html: string) => HTMLElement
  // Mark definitions collected from annotations during the current `deserialize` run
  _markDefs: PortableTextObject[] = []

  /**
   * Create a new deserializer respecting a Sanity block content type's schema
   *
   * @param blockContentType - Schema type for array containing _at least_ a block child type
   * @param options - Options for the deserialization process
   * @throws Error if `blockContentType` is not given
   */
  constructor(
    blockContentType: ArraySchemaType,
    options: HtmlDeserializerOptions = {},
  ) {
    const {rules = [], unstable_whitespaceOnPasteMode = 'preserve'} = options
    if (!blockContentType) {
      throw new Error("Parameter 'blockContentType' is required")
    }
    const standardRules = createRules(
      blockContentType,
      createRuleOptions(blockContentType),
    )
    // Custom rules are listed first, so they are tried before the standard ones
    this.rules = [...rules, ...standardRules]
    const parseHtml = options.parseHtml || defaultParseHtml()
    this.blockContentType = blockContentType
    // Every parse goes through the preprocessors; only the document body is used
    this.parseHtml = (html) => {
      const doc = preprocess(html, parseHtml, {unstable_whitespaceOnPasteMode})
      return doc.body
    }
  }

  /**
   * Deserialize HTML.
   *
   * @param html - The HTML to deserialize, as a string
   * @returns Array of blocks - either portable text blocks or other allowed blocks
   */
  deserialize = (html: string): TypedObject[] => {
    // Start every run with a clean slate of collected mark definitions
    this._markDefs = []
    const {parseHtml} = this
    const fragment = parseHtml(html)
    const children = Array.from(fragment.childNodes) as HTMLElement[]
    // Ensure that there are no blocks within blocks, and trim whitespace
    const blocks = trimWhitespace(
      flattenNestedBlocks(
        ensureRootIsBlocks(this.deserializeElements(children)),
      ),
    )

    if (this._markDefs.length > 0) {
      blocks
        .filter(
          (block): block is PortableTextTextBlock => block._type === 'block',
        )
        .forEach((block) => {
          // The marks used by this block do not depend on the mark definition
          // being tested, so collect them once per block.
          const usedMarks = flatten(
            block.children.map((child) => child.marks || []),
          )
          block.markDefs = block.markDefs || []
          // Attach only the mark definitions that this block actually references
          block.markDefs = block.markDefs.concat(
            this._markDefs.filter((def) => usedMarks.includes(def._key)),
          )
        })
    }

    // Set back the potentially hoisted block type
    const type = this.blockContentType.of.find(findBlockType)
    if (!type) {
      return blocks
    }

    return blocks.map((block) => {
      if (block._type === 'block') {
        block._type = type.name
      }
      return block
    })
  }

  /**
   * Deserialize an array of DOM elements.
   *
   * @param elements - Array of DOM elements to deserialize
   * @returns The deserialized results of all elements, flattened one level
   */
  deserializeElements = (elements: Node[] = []): TypedObject[] => {
    const nodes: TypedObject[] = []
    for (const element of elements) {
      const deserialized = this.deserializeElement(element)
      if (Array.isArray(deserialized)) {
        for (const item of deserialized) {
          nodes.push(item)
        }
      } else {
        nodes.push(deserialized)
      }
    }
    return nodes
  }

  /**
   * Deserialize a DOM element
   *
   * Rules are tried in order; rules returning `undefined` are skipped and the
   * first other result wins. If no rule produces anything, the children of the
   * element are deserialized instead.
   *
   * @param element - The DOM node to deserialize
   * @returns The deserialized node(s), or an empty array if nothing was produced
   * @throws Error if a rule returns `null` or a value that is neither an
   * object, an array nor `undefined`
   */
  deserializeElement = (element: Node): TypedObject | TypedObject[] => {
    // Handed to rules so they can continue deserializing child nodes
    const next = (
      elements: Node | Node[] | NodeList,
    ): TypedObject | TypedObject[] | undefined => {
      if (isNodeList(elements)) {
        return this.deserializeElements(Array.from(elements))
      }

      if (Array.isArray(elements)) {
        return this.deserializeElements(elements)
      }

      if (!elements) {
        return undefined
      }

      return this.deserializeElement(elements)
    }

    // Handed to rules so they can wrap an arbitrary object in a `__block` placeholder
    const block = (props: ArbitraryTypedObject) => {
      return {
        _type: '__block',
        block: props,
      }
    }

    let node: TypedObject | Array<TypedObject> | undefined
    for (const rule of this.rules) {
      if (!rule.deserialize) {
        continue
      }

      const ret = rule.deserialize(element, next, block)
      const type = resolveJsType(ret)

      if (
        type !== 'array' &&
        type !== 'object' &&
        type !== 'null' &&
        type !== 'undefined'
      ) {
        // Report the value the rule actually returned
        throw new Error(
          `A rule returned an invalid deserialized representation: "${String(ret)}".`,
        )
      }

      if (ret === undefined) {
        continue
      } else if (ret === null) {
        throw new Error('Deserializer rule returned `null`')
      } else if (Array.isArray(ret)) {
        node = ret
      } else if (isPlaceholderDecorator(ret)) {
        node = this.deserializeDecorator(ret)
      } else if (isPlaceholderAnnotation(ret)) {
        node = this.deserializeAnnotation(ret)
      } else {
        node = ret
      }

      // Set list level on list item: one level for every `li` found when
      // walking up two parents at a time from the element
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        'listItem' in ret
      ) {
        let parent = element.parentNode?.parentNode
        while (parent && tagName(parent) === 'li') {
          parent = parent.parentNode?.parentNode
          ret.level = ret.level ? ret.level + 1 : 1
        }
      }

      // Set newlines on spans originating from a block element within a blockquote
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        ret.style === 'blockquote'
      ) {
        const blockquoteChildren = ret.children
        const lastIndex = blockquoteChildren.length - 1
        // Walk backwards so that removing a child never shifts the position of
        // a child that has not been visited yet.
        for (let index = lastIndex; index >= 0; index--) {
          const child = blockquoteChildren[index]
          if (isMinimalSpan(child) && child.text === '\r') {
            if (index === 0 || index === lastIndex) {
              // A leading or trailing separator is dropped
              blockquoteChildren.splice(index, 1)
            } else {
              child.text = '\n\n'
            }
          }
        }
      }
      // The first rule that produced a result wins
      break
    }

    return node || next(element.childNodes) || []
  }

  /**
   * Deserialize a `__decorator` type
   * (an internal made up type to process decorators exclusively)
   *
   * @param decorator - Placeholder carrying the decorator name and the nodes it wraps
   * @returns The placeholder's children, flattened one level, with the
   * decorator name prepended to the marks of every span that has non-whitespace text
   */
  deserializeDecorator = (decorator: PlaceholderDecorator): TypedObject[] => {
    const {name} = decorator
    const applyDecorator = (node: TypedObject) => {
      if (isPlaceholderDecorator(node)) {
        return this.deserializeDecorator(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(name)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        // Anything else with a children array gets the decorator applied to each child
        const block = node as any
        block.children = block.children.map(applyDecorator)
      }
      return node
    }
    return decorator.children.reduce((children, node) => {
      const ret = applyDecorator(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }

  /**
   * Deserialize a `__annotation` object.
   * (an internal made up type to process annotations exclusively)
   *
   * The annotation's mark definition is recorded on the deserializer so that
   * `deserialize` can attach it to the blocks referencing it.
   *
   * @param annotation - Placeholder carrying the mark definition and the nodes it wraps
   * @returns The placeholder's children, flattened one level, with the mark
   * definition key prepended to the marks of every span that has non-whitespace text
   */
  deserializeAnnotation = (
    annotation: PlaceholderAnnotation,
  ): TypedObject[] => {
    const {markDef} = annotation
    this._markDefs.push(markDef)
    const applyAnnotation = (node: TypedObject) => {
      if (isPlaceholderAnnotation(node)) {
        return this.deserializeAnnotation(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(markDef._key)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        // Anything else with a children array gets the annotation applied to each child
        const block = node as any
        block.children = block.children.map(applyAnnotation)
      }
      return node
    }
    return annotation.children.reduce((children, node) => {
      const ret = applyAnnotation(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import type {HtmlPreprocessorOptions} from '../../types'
|
|
2
|
+
import {normalizeWhitespace, removeAllWhitespace, tagName} from '../helpers'
|
|
3
|
+
import {_XPathResult} from './xpathResult'
|
|
4
|
+
|
|
5
|
+
/**
 * Preprocessor for HTML that comes from Google Docs.
 *
 * The document is treated as Google Docs content when it contains an element
 * whose id contains "docs-internal-guid". In that case whitespace is handled
 * according to the paste mode, every element is tagged with
 * `data-is-google-docs`, direct children of the content root are tagged with
 * `data-is-root-node`, leading checkmark images are removed from list items,
 * and a wrapping `b` tag is unwrapped. Other documents are returned untouched.
 *
 * @param _html - The original HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @param options - Preprocessor options; `unstable_whitespaceOnPasteMode` defaults to 'preserve'
 * @returns The same document instance
 */
export default (
  _html: string,
  doc: Document,
  options: HtmlPreprocessorOptions,
): Document => {
  const whitespaceOnPasteMode =
    options?.unstable_whitespaceOnPasteMode || 'preserve'
  let gDocsRootOrSiblingNode = doc
    .evaluate(
      '//*[@id and contains(@id, "docs-internal-guid")]',
      doc,
      null,
      _XPathResult.ORDERED_NODE_ITERATOR_TYPE,
      null,
    )
    .iterateNext()

  // No Google Docs marker found: nothing to do
  if (!gDocsRootOrSiblingNode) {
    return doc
  }

  const isWrappedRootTag = tagName(gDocsRootOrSiblingNode) === 'b'

  // If this document isn't wrapped in a 'b' tag, then assume all siblings live on the root level
  if (!isWrappedRootTag) {
    gDocsRootOrSiblingNode = doc.body
  }

  switch (whitespaceOnPasteMode) {
    case 'normalize':
      // Keep only 1 empty block between content nodes
      normalizeWhitespace(gDocsRootOrSiblingNode)
      break
    case 'remove':
      // Remove all whitespace nodes
      removeAllWhitespace(gDocsRootOrSiblingNode)
      break
    default:
      break
  }

  // Tag every element with attribute 'data-is-google-docs' so that the GDocs rule-set can
  // work exclusively on these elements
  const childNodes = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  for (let i = childNodes.snapshotLength - 1; i >= 0; i--) {
    const elm = childNodes.snapshotItem(i) as HTMLElement | null
    if (!elm) {
      continue
    }
    elm.setAttribute('data-is-google-docs', 'true')

    // When the content is not wrapped, the root is the body itself (assigned
    // above), so this single comparison covers both the wrapped and unwrapped case.
    if (elm.parentElement === gDocsRootOrSiblingNode) {
      elm.setAttribute('data-is-root-node', 'true')
    }

    // Handle checkmark lists - The first child of a list item is an image with a checkmark, and the serializer
    // expects the first child to be the text node
    if (
      tagName(elm) === 'li' &&
      elm.firstChild &&
      tagName(elm.firstChild) === 'img'
    ) {
      elm.removeChild(elm.firstChild)
    }
  }

  // Remove that 'b' which Google Docs wraps the HTML content in
  // NOTE(review): this replaces the first element child of the body, which
  // assumes the wrapping 'b' is that element - confirm.
  if (isWrappedRootTag) {
    doc.body.firstElementChild?.replaceWith(
      ...Array.from(gDocsRootOrSiblingNode.childNodes),
    )
  }

  return doc
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import {_XPathResult} from './xpathResult'
|
|
2
|
+
|
|
3
|
+
// XPath expressions selecting the nodes that get stripped from the document.
// They are joined with '|' into a single expression when evaluated.
const unwantedWordDocumentPaths = [
  // Text nodes directly below html, head, body and top-level lists
  '/html/text()',
  '/html/head/text()',
  '/html/body/text()',
  '/html/body/ul/text()',
  '/html/body/ol/text()',
  // Comments anywhere in the document
  '//comment()',
  // Non-content elements anywhere in the document
  '//style',
  '//xml',
  '//script',
  '//meta',
  '//link',
]
|
|
17
|
+
|
|
18
|
+
/**
 * Generic HTML preprocessor.
 *
 * Wraps text that sits directly on the body in `span` elements, then removes
 * every node matched by `unwantedWordDocumentPaths`.
 *
 * @param _html - The original HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (_html: string, doc: Document): Document => {
  // Make sure text directly on the body is wrapped in spans.
  // This mimics what the browser does before putting html on the clipboard,
  // when used in a script context with JSDOM
  const bodyTextNodes = doc.evaluate(
    '/html/body/text()',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let textCursor = bodyTextNodes.snapshotLength
  while (textCursor-- > 0) {
    const textNode = bodyTextNodes.snapshotItem(textCursor) as HTMLElement
    const content = textNode.textContent || ''
    // Ignore trailing whitespace other than newlines when deciding if there is content
    const hasContent = content.replace(/[^\S\n]+$/g, '') !== ''
    if (!hasContent) {
      textNode.parentNode?.removeChild(textNode)
      continue
    }
    const wrapper = doc.createElement('span')
    wrapper.appendChild(doc.createTextNode(content))
    textNode.parentNode?.replaceChild(wrapper, textNode)
  }

  // Strip everything matched by the unwanted paths
  const unwantedNodes = doc.evaluate(
    unwantedWordDocumentPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )
  let unwantedCursor = unwantedNodes.snapshotLength
  while (unwantedCursor-- > 0) {
    const candidate = unwantedNodes.snapshotItem(unwantedCursor)
    if (candidate && candidate.parentNode) {
      candidate.parentNode.removeChild(candidate)
    }
  }
  return doc
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import preprocessGDocs from './gdocs'
|
|
2
|
+
import preprocessHTML from './html'
|
|
3
|
+
import preprocessNotion from './notion'
|
|
4
|
+
import preprocessWhitespace from './whitespace'
|
|
5
|
+
import preprocessWord from './word'
|
|
6
|
+
|
|
7
|
+
// The available HTML preprocessors.
// NOTE(review): presumably applied in this order by `preprocess` in ../helpers - confirm.
const preprocessors = [
  preprocessWhitespace,
  preprocessNotion,
  preprocessWord,
  preprocessGDocs,
  preprocessHTML,
]

export default preprocessors
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import {_XPathResult} from './xpathResult'
|
|
2
|
+
|
|
3
|
+
/**
 * Preprocessor for HTML that comes from Notion.
 *
 * When the raw HTML contains a `<!-- notionvc:... -->` comment, every element
 * in the document is tagged with `data-is-notion="true"`. Other documents are
 * returned untouched.
 *
 * @param html - The original HTML string, used to detect Notion content
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (html: string, doc: Document): Document => {
  const NOTION_REGEX = /<!-- notionvc:.*?-->/g

  // Not Notion content: leave the document alone
  if (!html.match(NOTION_REGEX)) {
    return doc
  }

  // Tag every element with attribute 'data-is-notion' so that the Notion rule-set can
  // work exclusively on these elements
  const everyElement = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let cursor = everyElement.snapshotLength
  while (cursor-- > 0) {
    const elm = everyElement.snapshotItem(cursor) as HTMLElement
    elm?.setAttribute('data-is-notion', 'true')
  }

  return doc
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import {PRESERVE_WHITESPACE_TAGS} from '../../constants'
|
|
2
|
+
import {_XPathResult} from './xpathResult'
|
|
3
|
+
|
|
4
|
+
/**
 * Whitespace preprocessor.
 *
 * Collapses runs of whitespace and newlines into single spaces in every text
 * node below the body, except for text whose parent element is listed in
 * `PRESERVE_WHITESPACE_TAGS`.
 *
 * @param _ - The original HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (_: string, doc: Document): Document => {
  // Depth-first walk over the given node and all of its descendants.
  function processNode(node: Node) {
    // NOTE(review): `_XPathResult.BOOLEAN_TYPE` is 3, which is also the DOM
    // nodeType of a text node - the comparison works by numeric coincidence.
    const isTextNode = node.nodeType === _XPathResult.BOOLEAN_TYPE
    const parentTag = node.parentElement?.tagName.toLowerCase() || ''

    if (isTextNode && !PRESERVE_WHITESPACE_TAGS.includes(parentTag)) {
      const collapsed = node.textContent
        ?.replace(/\s\s+/g, ' ') // Remove multiple whitespace
        .replace(/[\r\n]+/g, ' ') // Replace newlines with spaces
      node.textContent = collapsed || ''
      return
    }

    // Anything else: descend into the children, if any.
    for (const child of Array.from(node.childNodes)) {
      processNode(child)
    }
  }

  // Process all nodes starting from the body.
  processNode(doc.body)

  return doc
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import {_XPathResult} from './xpathResult'
|
|
2
|
+
|
|
3
|
+
// Matches markup that indicates the HTML was produced by Microsoft Word
const WORD_HTML_REGEX =
  /(class="?Mso|style=(?:"|')[^"]*?\bmso-|w:WordDocument|<o:\w+>|<\/font>)/

// XPath expressions for the elements that are removed from a Word document
const unwantedPaths: string[] = [
  '//o:p',
  "//span[@style='mso-list:Ignore']",
  "//span[@style='mso-list: Ignore']",
]

// XPath expressions for the elements that are replaced by other tags
const mappedPaths: string[] = [
  "//p[@class='MsoTocHeading']",
  "//p[@class='MsoTitle']",
  "//p[@class='MsoToaHeading']",
  "//p[@class='MsoSubtitle']",
  "//span[@class='MsoSubtleEmphasis']",
  "//span[@class='MsoIntenseEmphasis']",
]

// Class name of a mapped element -> the tag(s) that replace it.
// With several tags, each one is nested inside the previous one.
const elementMap: Record<string, string[] | undefined> = {
  MsoTocHeading: ['h3'],
  MsoTitle: ['h1'],
  MsoToaHeading: ['h2'],
  MsoSubtitle: ['h5'],
  MsoSubtleEmphasis: ['span', 'em'],
  MsoIntenseEmphasis: ['span', 'em', 'strong'],
}

/**
 * Tell whether an HTML string looks like it was produced by Microsoft Word.
 *
 * @param html - The raw HTML string
 * @returns `true` when the string matches `WORD_HTML_REGEX`
 */
function isWordHtml(html: string): boolean {
  const looksLikeWord = WORD_HTML_REGEX.test(html)
  return looksLikeWord
}
|
|
37
|
+
|
|
38
|
+
/**
 * Preprocessor for HTML that comes from Microsoft Word.
 *
 * For Word documents, removes the elements matched by `unwantedPaths` and
 * replaces the elements matched by `mappedPaths` with the tags given by
 * `elementMap`, keeping only their text content. Other documents are returned
 * untouched.
 *
 * @param html - The original HTML string, used to detect Word content
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (html: string, doc: Document): Document => {
  if (!isWordHtml(html)) {
    return doc
  }

  // The `o:` prefix used in the unwanted paths belongs to the Office namespace
  const resolveNamespace = (prefix: string | null) =>
    prefix === 'o' ? 'urn:schemas-microsoft-com:office:office' : null

  const unwantedNodes = doc.evaluate(
    unwantedPaths.join('|'),
    doc,
    resolveNamespace,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let unwantedCursor = unwantedNodes.snapshotLength
  while (unwantedCursor-- > 0) {
    const unwanted = unwantedNodes.snapshotItem(unwantedCursor)
    unwanted?.parentNode?.removeChild(unwanted)
  }

  // Transform mapped elements into what they should be mapped to
  const mappedElements = doc.evaluate(
    mappedPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let mappedCursor = mappedElements.snapshotLength
  while (mappedCursor-- > 0) {
    const mappedElm = mappedElements.snapshotItem(mappedCursor) as HTMLElement
    const tags = elementMap[mappedElm.className]
    if (!tags) {
      continue
    }

    // Build the replacement: the first tag is the outermost element and each
    // following tag is nested inside the previous one.
    const [outerTag, ...nestedTags] = tags
    const replacement = doc.createElement(outerTag)
    let innermost = replacement
    for (const tag of nestedTags) {
      const nested = doc.createElement(tag)
      innermost.appendChild(nested)
      innermost = nested
    }

    // Only the text of the original element is carried over
    innermost.appendChild(doc.createTextNode(mappedElm.textContent || ''))
    mappedElm?.parentNode?.replaceChild(replacement, mappedElm)
  }

  return doc
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// Local copy of the numeric result-type constants passed to `document.evaluate`.
// We need this here if run server side.
export const _XPathResult = {
  // Let the evaluation pick the natural result type
  ANY_TYPE: 0,
  // Primitive results
  NUMBER_TYPE: 1,
  STRING_TYPE: 2,
  BOOLEAN_TYPE: 3,
  // Node set results, consumed with `iterateNext()`
  UNORDERED_NODE_ITERATOR_TYPE: 4,
  ORDERED_NODE_ITERATOR_TYPE: 5,
  // Node set results, consumed with `snapshotItem()` / `snapshotLength`
  UNORDERED_NODE_SNAPSHOT_TYPE: 6,
  ORDERED_NODE_SNAPSHOT_TYPE: 7,
  // Single node results
  ANY_UNORDERED_NODE_TYPE: 8,
  FIRST_ORDERED_NODE_TYPE: 9,
}
|