@portabletext/block-tools 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,313 @@
1
+ import type {
2
+ ArraySchemaType,
3
+ PortableTextBlock,
4
+ PortableTextObject,
5
+ PortableTextTextBlock,
6
+ } from '@sanity/types'
7
+ import {flatten} from 'lodash'
8
+ import type {
9
+ ArbitraryTypedObject,
10
+ DeserializerRule,
11
+ HtmlDeserializerOptions,
12
+ PlaceholderAnnotation,
13
+ PlaceholderDecorator,
14
+ TypedObject,
15
+ } from '../types'
16
+ import {findBlockType} from '../util/findBlockType'
17
+ import {resolveJsType} from '../util/resolveJsType'
18
+ import {
19
+ createRuleOptions,
20
+ defaultParseHtml,
21
+ ensureRootIsBlocks,
22
+ flattenNestedBlocks,
23
+ isMinimalBlock,
24
+ isMinimalSpan,
25
+ isNodeList,
26
+ isPlaceholderAnnotation,
27
+ isPlaceholderDecorator,
28
+ preprocess,
29
+ tagName,
30
+ trimWhitespace,
31
+ } from './helpers'
32
+ import {createRules} from './rules'
33
+
34
/**
 * HTML Deserializer
 *
 * Turns an HTML string into an array of typed objects by running every DOM
 * node through an ordered list of deserializer rules.
 */
export default class HtmlDeserializer {
  blockContentType: ArraySchemaType
  rules: DeserializerRule[]
  parseHtml: (html: string) => HTMLElement
  // Mark definitions collected from annotations during the current `deserialize` call
  _markDefs: PortableTextObject[] = []

  /**
   * Create a new deserializer respecting a Sanity block content type's schema
   *
   * @param blockContentType - Schema type for array containing _at least_ a block child type
   * @param options - Options for the deserialization process
   * @throws Error when `blockContentType` is not provided
   */
  constructor(
    blockContentType: ArraySchemaType,
    options: HtmlDeserializerOptions = {},
  ) {
    const {rules = [], unstable_whitespaceOnPasteMode = 'preserve'} = options
    if (!blockContentType) {
      throw new Error("Parameter 'blockContentType' is required")
    }
    const standardRules = createRules(
      blockContentType,
      createRuleOptions(blockContentType),
    )
    // Caller-supplied rules go first, so they are tried before the standard rules
    this.rules = [...rules, ...standardRules]
    const parseHtml = options.parseHtml || defaultParseHtml()
    this.blockContentType = blockContentType
    // Every parse runs the preprocessing step and hands back the document body
    this.parseHtml = (html) => {
      const doc = preprocess(html, parseHtml, {unstable_whitespaceOnPasteMode})
      return doc.body
    }
  }

  /**
   * Deserialize HTML.
   *
   * @param html - The HTML to deserialize, as a string
   * @returns Array of blocks - either portable text blocks or other allowed blocks
   */
  deserialize = (html: string): TypedObject[] => {
    // Start from a clean slate; annotations found below push into this list
    this._markDefs = []
    const {parseHtml} = this
    const fragment = parseHtml(html)
    const children = Array.from(fragment.childNodes) as HTMLElement[]
    // Ensure that there are no blocks within blocks, and trim whitespace
    const blocks = trimWhitespace(
      flattenNestedBlocks(
        ensureRootIsBlocks(this.deserializeElements(children)),
      ),
    )

    if (this._markDefs.length > 0) {
      blocks
        .filter(
          (block): block is PortableTextTextBlock => block._type === 'block',
        )
        .forEach((block) => {
          // Gather the marks used by this block's children once, then attach
          // only the mark definitions that are actually referenced
          const usedMarks = new Set(
            flatten(block.children.map((child) => child.marks || [])),
          )
          block.markDefs = (block.markDefs || []).concat(
            this._markDefs.filter((def) => usedMarks.has(def._key)),
          )
        })
    }

    // Set back the potentially hoisted block type
    const type = this.blockContentType.of.find(findBlockType)
    if (!type) {
      return blocks
    }

    return blocks.map((block) => {
      if (block._type === 'block') {
        block._type = type.name
      }
      return block
    })
  }

  /**
   * Deserialize an array of DOM elements.
   *
   * @param elements - Array of DOM elements to deserialize
   * @returns The deserialized results of all elements, concatenated into one flat array
   */
  deserializeElements = (elements: Node[] = []): TypedObject[] => {
    let nodes: TypedObject[] = []
    elements.forEach((element) => {
      nodes = nodes.concat(this.deserializeElement(element))
    })
    return nodes
  }

  /**
   * Deserialize a DOM element
   *
   * The rules are tried in order; the first rule that returns something other
   * than `undefined` decides the result. When no rule produces a result, the
   * element's child nodes are deserialized instead.
   *
   * @param element - The DOM node to deserialize
   * @returns The deserialized node(s), or an empty array when nothing was produced
   * @throws Error when a rule returns `null` or a value that is neither an array nor an object
   */
  deserializeElement = (element: Node): TypedObject | TypedObject[] => {
    // Handed to rules so they can continue deserializing nested nodes
    const next = (
      elements: Node | Node[] | NodeList,
    ): TypedObject | TypedObject[] | undefined => {
      if (isNodeList(elements)) {
        return this.deserializeElements(Array.from(elements))
      }

      if (Array.isArray(elements)) {
        return this.deserializeElements(elements)
      }

      if (!elements) {
        return undefined
      }

      return this.deserializeElement(elements)
    }

    // Handed to rules so they can wrap arbitrary objects in a `__block` placeholder
    const block = (props: ArbitraryTypedObject) => {
      return {
        _type: '__block',
        block: props,
      }
    }

    let node: TypedObject | Array<TypedObject> | undefined
    for (let i = 0; i < this.rules.length; i++) {
      const rule = this.rules[i]
      if (!rule.deserialize) {
        continue
      }

      const ret = rule.deserialize(element, next, block)
      const type = resolveJsType(ret)

      if (
        type !== 'array' &&
        type !== 'object' &&
        type !== 'null' &&
        type !== 'undefined'
      ) {
        // Report the value the rule actually returned (`node` is still unset here)
        throw new Error(
          `A rule returned an invalid deserialized representation: "${String(ret)}".`,
        )
      }

      if (ret === undefined) {
        continue
      } else if (ret === null) {
        throw new Error('Deserializer rule returned `null`')
      } else if (Array.isArray(ret)) {
        node = ret
      } else if (isPlaceholderDecorator(ret)) {
        node = this.deserializeDecorator(ret)
      } else if (isPlaceholderAnnotation(ret)) {
        node = this.deserializeAnnotation(ret)
      } else {
        node = ret
      }

      // Set list level on list item
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        'listItem' in ret
      ) {
        // Walk up two levels at a time and add one level per enclosing `li`
        let parent = element.parentNode?.parentNode
        while (parent && tagName(parent) === 'li') {
          parent = parent.parentNode?.parentNode
          ret.level = ret.level ? ret.level + 1 : 1
        }
      }

      // Set newlines on spans originating from a block element within a blockquote
      if (
        ret &&
        !Array.isArray(ret) &&
        isMinimalBlock(ret) &&
        ret.style === 'blockquote'
      ) {
        const lastIndex = ret.children.length - 1
        const indexesToDrop: number[] = []
        ret.children.forEach((child, index) => {
          if (isMinimalSpan(child) && child.text === '\r') {
            child.text = '\n\n'
            if (index === 0 || index === lastIndex) {
              indexesToDrop.push(index)
            }
          }
        })
        // Remove leading/trailing newline spans only after the walk, and from
        // the end, so no child is skipped and the recorded indexes stay valid
        indexesToDrop.reverse().forEach((index) => {
          ret.children.splice(index, 1)
        })
      }
      // The first rule that produced a result wins
      break
    }

    return node || next(element.childNodes) || []
  }

  /**
   * Deserialize a `__decorator` type
   * (an internal made up type to process decorators exclusively)
   *
   * Prepends the decorator name to the marks of every span with
   * non-whitespace text among the decorator's children, descending into
   * nested decorators and into objects that have a `children` array.
   *
   * @param decorator - The placeholder decorator to resolve
   * @returns The decorator's children with the decorator applied
   */
  deserializeDecorator = (decorator: PlaceholderDecorator): TypedObject[] => {
    const {name} = decorator
    const applyDecorator = (node: TypedObject) => {
      if (isPlaceholderDecorator(node)) {
        return this.deserializeDecorator(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(name)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        const block = node as any
        block.children = block.children.map(applyDecorator)
      }
      return node
    }
    return decorator.children.reduce((children, node) => {
      const ret = applyDecorator(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }

  /**
   * Deserialize a `__annotation` object.
   * (an internal made up type to process annotations exclusively)
   *
   * Records the annotation's mark definition on the deserializer and prepends
   * its `_key` to the marks of every span with non-whitespace text among the
   * annotation's children, descending into nested annotations and into
   * objects that have a `children` array.
   *
   * @param annotation - The placeholder annotation to resolve
   * @returns The annotation's children with the annotation applied
   */
  deserializeAnnotation = (
    annotation: PlaceholderAnnotation,
  ): TypedObject[] => {
    const {markDef} = annotation
    this._markDefs.push(markDef)
    const applyAnnotation = (node: TypedObject) => {
      if (isPlaceholderAnnotation(node)) {
        return this.deserializeAnnotation(node)
      } else if (isMinimalSpan(node)) {
        node.marks = node.marks || []
        if (node.text.trim()) {
          // Only apply marks if this is an actual text
          node.marks.unshift(markDef._key)
        }
      } else if (
        'children' in node &&
        Array.isArray((node as PortableTextBlock).children)
      ) {
        const block = node as any
        block.children = block.children.map(applyAnnotation)
      }
      return node
    }
    return annotation.children.reduce((children, node) => {
      const ret = applyAnnotation(node)
      if (Array.isArray(ret)) {
        return children.concat(ret)
      }
      children.push(ret)
      return children
    }, [] as TypedObject[])
  }
}
@@ -0,0 +1,86 @@
1
+ import type {HtmlPreprocessorOptions} from '../../types'
2
+ import {normalizeWhitespace, removeAllWhitespace, tagName} from '../helpers'
3
+ import {_XPathResult} from './xpathResult'
4
+
5
/**
 * Preprocessor for HTML that looks like it was copied from Google Docs.
 *
 * The document is treated as Google Docs content when it contains an element
 * whose `id` contains "docs-internal-guid". In that case whitespace is handled
 * according to `unstable_whitespaceOnPasteMode`, every element is tagged with
 * `data-is-google-docs`, root-level elements are tagged with
 * `data-is-root-node`, leading images are removed from list items, and a
 * wrapping `b` element is replaced by its children.
 *
 * @param _html - The raw HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @param options - Preprocessor options; only `unstable_whitespaceOnPasteMode` is read
 * @returns The same document instance
 */
export default (
  _html: string,
  doc: Document,
  options: HtmlPreprocessorOptions,
): Document => {
  const whitespaceOnPasteMode =
    options?.unstable_whitespaceOnPasteMode || 'preserve'
  let gDocsRootOrSiblingNode = doc
    .evaluate(
      '//*[@id and contains(@id, "docs-internal-guid")]',
      doc,
      null,
      _XPathResult.ORDERED_NODE_ITERATOR_TYPE,
      null,
    )
    .iterateNext()

  // Not Google Docs content: leave the document untouched
  if (!gDocsRootOrSiblingNode) {
    return doc
  }

  const isWrappedRootTag = tagName(gDocsRootOrSiblingNode) === 'b'

  // If this document isn't wrapped in a 'b' tag, then assume all siblings live on the root level
  if (!isWrappedRootTag) {
    gDocsRootOrSiblingNode = doc.body
  }

  switch (whitespaceOnPasteMode) {
    case 'normalize':
      // Keep only 1 empty block between content nodes
      normalizeWhitespace(gDocsRootOrSiblingNode)
      break
    case 'remove':
      // Remove all whitespace nodes
      removeAllWhitespace(gDocsRootOrSiblingNode)
      break
    default:
      break
  }

  // Tag every element with attribute 'data-is-google-docs' so that the GDocs rule-set can
  // work exclusively on these elements
  const childNodes = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  for (let i = childNodes.snapshotLength - 1; i >= 0; i--) {
    const elm = childNodes.snapshotItem(i) as HTMLElement | null
    if (!elm) {
      continue
    }
    elm.setAttribute('data-is-google-docs', 'true')

    // Direct children of the root are flagged as root nodes. When the content
    // is not wrapped in a 'b' tag the root is `doc.body` (assigned above).
    if (elm.parentElement === gDocsRootOrSiblingNode) {
      elm.setAttribute('data-is-root-node', 'true')
    }

    // Handle checkmark lists - The first child of a list item is an image with a checkmark, and the serializer
    // expects the first child to be the text node
    const firstChild = elm.firstChild
    if (tagName(elm) === 'li' && firstChild && tagName(firstChild) === 'img') {
      elm.removeChild(firstChild)
    }
  }

  // Remove that 'b' which Google Docs wraps the HTML content in
  if (isWrappedRootTag) {
    doc.body.firstElementChild?.replaceWith(
      ...Array.from(gDocsRootOrSiblingNode.childNodes),
    )
  }

  return doc
}
@@ -0,0 +1,57 @@
1
+ import {_XPathResult} from './xpathResult'
2
+
3
// XPath expressions selecting the cruft that is removed from the document
const unwantedWordDocumentPaths = [
  '/html/text()',
  '/html/head/text()',
  '/html/body/text()',
  '/html/body/ul/text()',
  '/html/body/ol/text()',
  '//comment()',
  '//style',
  '//xml',
  '//script',
  '//meta',
  '//link',
]

/**
 * Generic HTML preprocessor.
 *
 * Wraps text that sits directly on the body in `span` elements, then removes
 * every node matched by `unwantedWordDocumentPaths`.
 *
 * @param _html - The raw HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (_html: string, doc: Document): Document => {
  // Make sure text directly on the body is wrapped in spans.
  // This mimics what the browser does before putting html on the clipboard,
  // when used in a script context with JSDOM
  const bodyTextNodes = doc.evaluate(
    '/html/body/text()',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  for (let i = bodyTextNodes.snapshotLength - 1; i >= 0; i--) {
    // These are text nodes, so they are handled as plain `Node`s
    const node = bodyTextNodes.snapshotItem(i)
    if (!node) {
      continue
    }
    const text = node.textContent || ''
    // Text that is empty, or made up solely of whitespace other than
    // newlines, is dropped; everything else is wrapped in a span
    if (text.replace(/[^\S\n]+$/g, '')) {
      const newNode = doc.createElement('span')
      newNode.appendChild(doc.createTextNode(text))
      node.parentNode?.replaceChild(newNode, node)
    } else {
      node.parentNode?.removeChild(node)
    }
  }

  const unwantedNodes = doc.evaluate(
    unwantedWordDocumentPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )
  for (let i = unwantedNodes.snapshotLength - 1; i >= 0; i--) {
    const unwanted = unwantedNodes.snapshotItem(i)
    if (!unwanted) {
      continue
    }
    unwanted.parentNode?.removeChild(unwanted)
  }
  return doc
}
@@ -0,0 +1,13 @@
1
+ import preprocessGDocs from './gdocs'
2
+ import preprocessHTML from './html'
3
+ import preprocessNotion from './notion'
4
+ import preprocessWhitespace from './whitespace'
5
+ import preprocessWord from './word'
6
+
7
/**
 * The available HTML preprocessors, listed in this order: whitespace, Notion,
 * Word, Google Docs and generic HTML.
 */
const preprocessors = [
  preprocessWhitespace,
  preprocessNotion,
  preprocessWord,
  preprocessGDocs,
  preprocessHTML,
]

export default preprocessors
@@ -0,0 +1,25 @@
1
+ import {_XPathResult} from './xpathResult'
2
+
3
/**
 * Preprocessor for HTML copied from Notion.
 *
 * When the raw HTML contains a `<!-- notionvc:... -->` comment, every element
 * in the document gets the attribute `data-is-notion="true"`. Otherwise the
 * document is returned unchanged.
 *
 * @param html - The raw HTML string, inspected for the Notion marker comment
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (html: string, doc: Document): Document => {
  const NOTION_REGEX = /<!-- notionvc:.*?-->/g

  // No Notion marker: nothing to tag
  if (!html.match(NOTION_REGEX)) {
    return doc
  }

  // Tag every element with attribute 'data-is-notion' so that the Notion rule-set can
  // work exclusively on these elements
  const allElements = doc.evaluate(
    '//*',
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  // Walk the snapshot from the last item down to the first
  let index = allElements.snapshotLength
  while (index-- > 0) {
    const elm = allElements.snapshotItem(index) as HTMLElement
    elm?.setAttribute('data-is-notion', 'true')
  }

  return doc
}
@@ -0,0 +1,31 @@
1
+ import {PRESERVE_WHITESPACE_TAGS} from '../../constants'
2
+ import {_XPathResult} from './xpathResult'
3
+
4
/**
 * Whitespace preprocessor.
 *
 * Walks the tree below `doc.body` and collapses whitespace in text nodes,
 * except for text whose parent element is listed in `PRESERVE_WHITESPACE_TAGS`.
 *
 * @param _ - The raw HTML string (unused)
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (_: string, doc: Document): Document => {
  // Recursively process all nodes.
  function processNode(node: Node) {
    // NOTE(review): `_XPathResult.BOOLEAN_TYPE` stands in for a text-node
    // constant here; presumably this relies on it sharing the numeric value of
    // the DOM's `Node.TEXT_NODE` - consider a dedicated constant.
    const isCollapsibleText =
      node.nodeType === _XPathResult.BOOLEAN_TYPE &&
      !PRESERVE_WHITESPACE_TAGS.includes(
        node.parentElement?.tagName.toLowerCase() || '',
      )

    // Anything that is not collapsible text: descend into its children, if any.
    if (!isCollapsibleText) {
      for (const child of Array.from(node.childNodes)) {
        processNode(child)
      }
      return
    }

    // Text outside a whitespace-preserving tag: collapse it.
    const collapsed = node.textContent
      ?.replace(/\s\s+/g, ' ') // Remove multiple whitespace
      .replace(/[\r\n]+/g, ' ') // Replace newlines with spaces
    node.textContent = collapsed || ''
  }

  // Process all nodes starting from the root.
  processNode(doc.body)

  return doc
}
@@ -0,0 +1,92 @@
1
+ import {_XPathResult} from './xpathResult'
2
+
3
// Pattern used to recognise HTML as Word output
const WORD_HTML_REGEX =
  /(class="?Mso|style=(?:"|')[^"]*?\bmso-|w:WordDocument|<o:\w+>|<\/font>)/

// xPaths for elements that will be removed from the document
const unwantedPaths = [
  '//o:p',
  "//span[@style='mso-list:Ignore']",
  "//span[@style='mso-list: Ignore']",
]

// xPaths for elements that needs to be remapped into other tags
const mappedPaths = [
  "//p[@class='MsoTocHeading']",
  "//p[@class='MsoTitle']",
  "//p[@class='MsoToaHeading']",
  "//p[@class='MsoSubtitle']",
  "//span[@class='MsoSubtleEmphasis']",
  "//span[@class='MsoIntenseEmphasis']",
]

// Which HTML element(s) to map the elements matching mappedPaths into.
// Several tags mean nested elements, outermost first.
const elementMap: Record<string, string[] | undefined> = {
  MsoTocHeading: ['h3'],
  MsoTitle: ['h1'],
  MsoToaHeading: ['h2'],
  MsoSubtitle: ['h5'],
  MsoSubtleEmphasis: ['span', 'em'],
  MsoIntenseEmphasis: ['span', 'em', 'strong'],
  // Remove cruft
}

// Tells whether the raw HTML matches the Word pattern
const isWordHtml = (html: string): boolean => WORD_HTML_REGEX.test(html)

/**
 * Preprocessor for HTML that matches the Word pattern.
 *
 * Removes the nodes matched by `unwantedPaths` and replaces the elements
 * matched by `mappedPaths` with the tag structure given in `elementMap`,
 * carrying over the element's text content only.
 *
 * @param html - The raw HTML string, used for detection
 * @param doc - The parsed document; it is modified in place
 * @returns The same document instance
 */
export default (html: string, doc: Document): Document => {
  if (!isWordHtml(html)) {
    return doc
  }

  // Resolves the `o:` prefix used by the `//o:p` path
  const resolveNamespace = (prefix: string | null): string | null =>
    prefix === 'o' ? 'urn:schemas-microsoft-com:office:office' : null

  const unwantedNodes = doc.evaluate(
    unwantedPaths.join('|'),
    doc,
    resolveNamespace,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let unwantedIndex = unwantedNodes.snapshotLength
  while (unwantedIndex-- > 0) {
    const unwanted = unwantedNodes.snapshotItem(unwantedIndex)
    if (unwanted?.parentNode) {
      unwanted.parentNode.removeChild(unwanted)
    }
  }

  // Transform mapped elements into what they should be mapped to
  const mappedElements = doc.evaluate(
    mappedPaths.join('|'),
    doc,
    null,
    _XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE,
    null,
  )

  let mappedIndex = mappedElements.snapshotLength
  while (mappedIndex-- > 0) {
    const mappedElm = mappedElements.snapshotItem(mappedIndex) as HTMLElement
    const tags = elementMap[mappedElm.className]
    if (!tags) {
      continue
    }

    // Build the replacement from the outside in, keeping track of the
    // innermost element so the text ends up at the deepest level
    const replacement = doc.createElement(tags[0])
    let innermost = replacement
    for (const tag of tags.slice(1)) {
      const nested = doc.createElement(tag)
      innermost.appendChild(nested)
      innermost = nested
    }
    innermost.appendChild(doc.createTextNode(mappedElm.textContent || ''))
    mappedElm?.parentNode?.replaceChild(replacement, mappedElm)
  }

  return doc
}
@@ -0,0 +1,13 @@
1
/**
 * Numeric XPath result-type constants, passed as the result type to
 * `Document.evaluate`.
 *
 * We need this here if run server side
 */
export const _XPathResult = {
  // Generic / primitive result types
  ANY_TYPE: 0,
  NUMBER_TYPE: 1,
  STRING_TYPE: 2,
  BOOLEAN_TYPE: 3,

  // Iterator result types
  UNORDERED_NODE_ITERATOR_TYPE: 4,
  ORDERED_NODE_ITERATOR_TYPE: 5,

  // Snapshot result types
  UNORDERED_NODE_SNAPSHOT_TYPE: 6,
  ORDERED_NODE_SNAPSHOT_TYPE: 7,

  // Single-node result types
  ANY_UNORDERED_NODE_TYPE: 8,
  FIRST_ORDERED_NODE_TYPE: 9,
}