npm - @portabletext/block-tools - Versions diffs - 4.0.2 → 4.1.1 - Mend

@portabletext/block-tools 4.0.2 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/src/HtmlDeserializer/helpers.ts CHANGED Viewed

@@ -1,10 +1,5 @@
 import type {Schema} from '@portabletext/schema'
-import {
-  isTextBlock,
-  type PortableTextObject,
-  type PortableTextTextBlock,
-} from '@portabletext/schema'
-import {isEqual} from 'lodash'
+import {isTextBlock, type PortableTextObject} from '@portabletext/schema'
 import {DEFAULT_BLOCK} from '../constants'
 import type {
   ArbitraryTypedObject,
@@ -49,88 +44,6 @@ export function defaultParseHtml(): HtmlParser {
   }
 }
-function nextSpan(block: PortableTextTextBlock, index: number) {
-  const next = block.children[index + 1]
-  return next && next._type === 'span' ? next : null
-}
-function prevSpan(block: PortableTextTextBlock, index: number) {
-  const prev = block.children[index - 1]
-  return prev && prev._type === 'span' ? prev : null
-}
-function isWhiteSpaceChar(text: string) {
-  return ['\xa0', ' '].includes(text)
-}
-/**
- * NOTE: _mutates_ passed blocks!
- *
- * @param blocks - Array of blocks to trim whitespace for
- * @returns
- */
-export function trimWhitespace(
-  schema: Schema,
-  blocks: TypedObject[],
-): TypedObject[] {
-  blocks.forEach((block) => {
-    if (!isTextBlock({schema}, block)) {
-      return
-    }
-    // eslint-disable-next-line complexity
-    block.children.forEach((child, index) => {
-      if (!isMinimalSpan(child)) {
-        return
-      }
-      const nextChild = nextSpan(block, index)
-      const prevChild = prevSpan(block, index)
-      if (index === 0) {
-        child.text = child.text.replace(/^[^\S\n]+/g, '')
-      }
-      if (index === block.children.length - 1) {
-        child.text = child.text.replace(/[^\S\n]+$/g, '')
-      }
-      if (
-        /\s/.test(child.text.slice(Math.max(0, child.text.length - 1))) &&
-        nextChild &&
-        isMinimalSpan(nextChild) &&
-        /\s/.test(nextChild.text.slice(0, 1))
-      ) {
-        child.text = child.text.replace(/[^\S\n]+$/g, '')
-      }
-      if (
-        /\s/.test(child.text.slice(0, 1)) &&
-        prevChild &&
-        isMinimalSpan(prevChild) &&
-        /\s/.test(prevChild.text.slice(Math.max(0, prevChild.text.length - 1)))
-      ) {
-        child.text = child.text.replace(/^[^\S\n]+/g, '')
-      }
-      if (!child.text) {
-        block.children.splice(index, 1)
-      }
-      if (
-        prevChild &&
-        isEqual(prevChild.marks, child.marks) &&
-        isWhiteSpaceChar(child.text)
-      ) {
-        prevChild.text += ' '
-        block.children.splice(index, 1)
-      } else if (
-        nextChild &&
-        isEqual(nextChild.marks, child.marks) &&
-        isWhiteSpaceChar(child.text)
-      ) {
-        nextChild.text = ` ${nextChild.text}`
-        block.children.splice(index, 1)
-      }
-    })
-  })
-  return blocks
-}
 export function ensureRootIsBlocks(
   schema: Schema,
   objects: Array<ArbitraryTypedObject>,
@@ -193,98 +106,3 @@ export function isPlaceholderAnnotation(
 export function isElement(node: Node): node is Element {
   return node.nodeType === 1
 }
-/**
- * Helper to normalize whitespace to only 1 empty block between content nodes
- * @param node - Root node to process
- */
-export function normalizeWhitespace(rootNode: Node) {
-  let emptyBlockCount = 0
-  let lastParent = null
-  const nodesToRemove: Node[] = []
-  for (let child = rootNode.firstChild; child; child = child.nextSibling) {
-    if (!isElement(child)) {
-      normalizeWhitespace(child)
-      emptyBlockCount = 0
-      continue
-    }
-    const elm = child as HTMLElement
-    if (isWhitespaceBlock(elm)) {
-      if (lastParent && elm.parentElement === lastParent) {
-        emptyBlockCount++
-        if (emptyBlockCount > 1) {
-          nodesToRemove.push(elm)
-        }
-      } else {
-        // Different parent, reset counter
-        emptyBlockCount = 1
-      }
-      lastParent = elm.parentElement
-    } else {
-      // Recurse into child nodes
-      normalizeWhitespace(child)
-      // Reset counter for siblings
-      emptyBlockCount = 0
-    }
-  }
-  // Remove marked nodes
-  nodesToRemove.forEach((node) => {
-    node.parentElement?.removeChild(node)
-  })
-}
-/**
- * Helper to remove all whitespace nodes
- * @param node - Root node to process
- */
-export function removeAllWhitespace(rootNode: Node) {
-  const nodesToRemove: Node[] = []
-  function collectNodesToRemove(currentNode: Node) {
-    if (isElement(currentNode)) {
-      const elm = currentNode as HTMLElement
-      // Handle <br> tags that is between <p> tags
-      if (
-        tagName(elm) === 'br' &&
-        (tagName(elm.nextElementSibling) === 'p' ||
-          tagName(elm.previousElementSibling) === 'p')
-      ) {
-        nodesToRemove.push(elm)
-        return
-      }
-      // Handle empty blocks
-      if (
-        (tagName(elm) === 'p' || tagName(elm) === 'br') &&
-        elm?.firstChild?.textContent?.trim() === ''
-      ) {
-        nodesToRemove.push(elm)
-        return
-      }
-      // Recursively process child nodes
-      for (let child = elm.firstChild; child; child = child.nextSibling) {
-        collectNodesToRemove(child)
-      }
-    }
-  }
-  collectNodesToRemove(rootNode)
-  // Remove the collected nodes
-  nodesToRemove.forEach((node) => {
-    node.parentElement?.removeChild(node)
-  })
-}
-function isWhitespaceBlock(elm: HTMLElement): boolean {
-  return ['p', 'br'].includes(tagName(elm) || '') && !elm.textContent?.trim()
-}

package/src/HtmlDeserializer/index.ts CHANGED Viewed

@@ -10,8 +10,6 @@ import type {
   ArbitraryTypedObject,
   DeserializerRule,
   HtmlDeserializerOptions,
-  HtmlParser,
-  HtmlPreprocessorOptions,
   PlaceholderAnnotation,
   PlaceholderDecorator,
   TypedObject,
@@ -28,10 +26,10 @@ import {
   isPlaceholderAnnotation,
   isPlaceholderDecorator,
   tagName,
-  trimWhitespace,
 } from './helpers'
-import preprocessors from './preprocessors'
+import {preprocessors} from './preprocessors'
 import {createRules} from './rules'
+import {trimWhitespace} from './trim-whitespace'
 /**
  * HTML Deserializer
@@ -42,6 +40,7 @@ export default class HtmlDeserializer {
   schema: Schema
   rules: DeserializerRule[]
   parseHtml: (html: string) => HTMLElement
+  whitespaceMode: 'preserve' | 'remove' | 'normalize'
   _markDefs: PortableTextObject[] = []
   /**
@@ -59,9 +58,16 @@ export default class HtmlDeserializer {
     this.schema = schema
     this.keyGenerator = options.keyGenerator ?? keyGenerator
     this.rules = [...rules, ...standardRules]
+    this.whitespaceMode = unstable_whitespaceOnPasteMode
     const parseHtml = options.parseHtml || defaultParseHtml()
     this.parseHtml = (html) => {
-      const doc = preprocess(html, parseHtml, {unstable_whitespaceOnPasteMode})
+      const cleanHTML = vercelStegaClean(html)
+      const doc = parseHtml(cleanHTML)
+      for (const processor of preprocessors) {
+        processor(cleanHTML, doc)
+      }
       return doc.body
     }
   }
@@ -77,9 +83,10 @@ export default class HtmlDeserializer {
     const {parseHtml} = this
     const fragment = parseHtml(html)
     const children = Array.from(fragment.childNodes) as HTMLElement[]
-    // Ensure that there are no blocks within blocks, and trim whitespace
     const blocks = trimWhitespace(
-      this.schema,
+      {schema: this.schema},
+      this.whitespaceMode,
       flattenNestedBlocks(
         {schema: this.schema},
         ensureRootIsBlocks(
@@ -306,21 +313,3 @@ export default class HtmlDeserializer {
     }, [] as TypedObject[])
   }
 }
-// TODO: make this plugin-style
-function preprocess(
-  html: string,
-  parseHtml: HtmlParser,
-  options: HtmlPreprocessorOptions,
-): Document {
-  const cleanHTML = vercelStegaClean(html)
-  const doc = parseHtml(normalizeHtmlBeforePreprocess(cleanHTML))
-  preprocessors.forEach((processor) => {
-    processor(cleanHTML, doc, options)
-  })
-  return doc
-}
-function normalizeHtmlBeforePreprocess(html: string): string {
-  return html.trim()
-}

package/src/HtmlDeserializer/preprocessors/index.ts CHANGED Viewed

@@ -1,13 +1,15 @@
-import preprocessGDocs from './gdocs'
-import preprocessHTML from './html'
-import preprocessNotion from './notion'
-import preprocessWhitespace from './whitespace'
-import preprocessWord from './word'
+import {preprocessWordOnline} from '../word-online/preprocessor.word-online'
+import {preprocessGDocs} from './preprocessor.gdocs'
+import {preprocessHTML} from './preprocessor.html'
+import {preprocessNotion} from './preprocessor.notion'
+import {preprocessWhitespace} from './preprocessor.whitespace'
+import {preprocessWord} from './preprocessor.word'
-export default [
+export const preprocessors = [
   preprocessWhitespace,
   preprocessNotion,
   preprocessWord,
+  preprocessWordOnline,
   preprocessGDocs,
   preprocessHTML,
 ]

package/src/HtmlDeserializer/preprocessors/{gdocs.ts → preprocessor.gdocs.ts} RENAMED Viewed

@@ -1,14 +1,7 @@
-import type {HtmlPreprocessorOptions} from '../../types'
-import {normalizeWhitespace, removeAllWhitespace, tagName} from '../helpers'
+import {tagName} from '../helpers'
 import {_XPathResult} from './xpathResult'
-export default (
-  _html: string,
-  doc: Document,
-  options: HtmlPreprocessorOptions,
-): Document => {
-  const whitespaceOnPasteMode =
-    options?.unstable_whitespaceOnPasteMode || 'preserve'
+export function preprocessGDocs(_html: string, doc: Document): Document {
   let gDocsRootOrSiblingNode = doc
     .evaluate(
       '//*[@id and contains(@id, "docs-internal-guid")]',
@@ -27,19 +20,6 @@ export default (
       gDocsRootOrSiblingNode = doc.body
     }
-    switch (whitespaceOnPasteMode) {
-      case 'normalize':
-        // Keep only 1 empty block between content nodes
-        normalizeWhitespace(gDocsRootOrSiblingNode)
-        break
-      case 'remove':
-        // Remove all whitespace nodes
-        removeAllWhitespace(gDocsRootOrSiblingNode)
-        break
-      default:
-        break
-    }
     // Tag every child with attribute 'is-google-docs' so that the GDocs rule-set can
     // work exclusivly on these children
     const childNodes = doc.evaluate(

package/src/HtmlDeserializer/preprocessors/{html.ts → preprocessor.html.ts} RENAMED Viewed

@@ -15,7 +15,7 @@ const unwantedWordDocumentPaths = [
   '//link',
 ]
-export default (_html: string, doc: Document): Document => {
+export function preprocessHTML(_html: string, doc: Document): Document {
   // Make sure text directly on the body is wrapped in spans.
   // This mimics what the browser does before putting html on the clipboard,
   // when used in a script context with JSDOM

package/src/HtmlDeserializer/preprocessors/{notion.ts → preprocessor.notion.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
 import {_XPathResult} from './xpathResult'
-export default (html: string, doc: Document): Document => {
+export function preprocessNotion(html: string, doc: Document): Document {
   const NOTION_REGEX = /<!-- notionvc:.*?-->/g
   if (html.match(NOTION_REGEX)) {

package/src/HtmlDeserializer/preprocessors/{whitespace.ts → preprocessor.whitespace.ts} RENAMED Viewed

@@ -1,7 +1,19 @@
 import {PRESERVE_WHITESPACE_TAGS} from '../../constants'
 import {_XPathResult} from './xpathResult'
-export default (_: string, doc: Document): Document => {
+// Elements that only contain block-level children (not inline text content)
+const BLOCK_CONTAINER_ELEMENTS = [
+  'body',
+  'table',
+  'tbody',
+  'thead',
+  'tfoot',
+  'tr',
+  'ul',
+  'ol',
+]
+export function preprocessWhitespace(_: string, doc: Document): Document {
   // Recursively process all nodes.
   function processNode(node: Node) {
     // If this is a text node and not inside a tag where whitespace should be preserved, process it.
@@ -11,14 +23,27 @@ export default (_: string, doc: Document): Document => {
         node.parentElement?.tagName.toLowerCase() || '',
       )
     ) {
-      node.textContent =
+      const normalized =
         node.textContent
           ?.replace(/\s\s+/g, ' ') // Remove multiple whitespace
           .replace(/[\r\n]+/g, ' ') || '' // Replace newlines with spaces
+      const parentTag = node.parentElement?.tagName.toLowerCase()
+      if (
+        parentTag &&
+        BLOCK_CONTAINER_ELEMENTS.includes(parentTag) &&
+        normalized.trim() === ''
+      ) {
+        // If parent is a block container and text is only whitespace, remove it
+        node.parentNode?.removeChild(node)
+      } else {
+        node.textContent = normalized
+      }
     }
     // Otherwise, if this node has children, process them.
     else {
-      for (let i = 0; i < node.childNodes.length; i++) {
+      // Process children in reverse to handle removals safely
+      for (let i = node.childNodes.length - 1; i >= 0; i--) {
         processNode(node.childNodes[i])
       }
     }

package/src/HtmlDeserializer/preprocessors/{word.ts → preprocessor.word.ts} RENAMED Viewed

@@ -35,7 +35,7 @@ function isWordHtml(html: string) {
   return WORD_HTML_REGEX.test(html)
 }
-export default (html: string, doc: Document): Document => {
+export function preprocessWord(html: string, doc: Document): Document {
   if (!isWordHtml(html)) {
     return doc
   }

package/src/HtmlDeserializer/rules/index.ts CHANGED Viewed

@@ -1,10 +1,11 @@
 import type {Schema} from '@portabletext/schema'
 import type {SchemaMatchers} from '../../schema-matchers'
 import type {DeserializerRule} from '../../types'
-import createGDocsRules from './gdocs'
-import createHTMLRules from './html'
-import createNotionRules from './notion'
-import createWordRules from './word'
+import {createWordOnlineRules} from '../word-online/rules.word-online'
+import {createGDocsRules} from './rules.gdocs'
+import {createHTMLRules} from './rules.html'
+import {createNotionRules} from './rules.notion'
+import {createWordRules} from './rules.word'
 export function createRules(
   schema: Schema,
@@ -12,6 +13,7 @@ export function createRules(
 ): DeserializerRule[] {
   return [
     ...createWordRules(),
+    ...createWordOnlineRules(schema, options),
     ...createNotionRules(),
     ...createGDocsRules(schema),
     ...createHTMLRules(schema, options),

package/src/HtmlDeserializer/rules/{gdocs.ts → rules.gdocs.ts} RENAMED Viewed

@@ -93,7 +93,7 @@ function getBlockStyle(schema: Schema, el: Node): string {
   return block.style
 }
-export default function createGDocsRules(schema: Schema): DeserializerRule[] {
+export function createGDocsRules(schema: Schema): DeserializerRule[] {
   return [
     {
       deserialize(el, next) {

package/src/HtmlDeserializer/rules/{html.ts → rules.html.ts} RENAMED Viewed

@@ -14,9 +14,9 @@ import type {SchemaMatchers} from '../../schema-matchers'
 import type {DeserializerRule} from '../../types'
 import {keyGenerator} from '../../util/randomKey'
 import {isElement, tagName} from '../helpers'
-import {whitespaceTextNodeRule} from './whitespace-text-node'
+import {whitespaceTextNodeRule} from './rules.whitespace-text-node'
-export function resolveListItem(
+function resolveListItem(
   schema: Schema,
   listNodeTagName: string,
 ): string | undefined {
@@ -35,7 +35,7 @@ export function resolveListItem(
   return undefined
 }
-export default function createHTMLRules(
+export function createHTMLRules(
   schema: Schema,
   options: {keyGenerator?: () => string; matchers?: SchemaMatchers},
 ): DeserializerRule[] {

package/src/HtmlDeserializer/rules/{notion.ts → rules.notion.ts} RENAMED Viewed

@@ -27,7 +27,7 @@ function isNotion(el: Node): boolean {
   return isElement(el) && Boolean(el.getAttribute('data-is-notion'))
 }
-export default function createNotionRules(): DeserializerRule[] {
+export function createNotionRules(): DeserializerRule[] {
   return [
     {
       deserialize(el) {

package/src/HtmlDeserializer/rules/rules.word.ts ADDED Viewed

@@ -0,0 +1,95 @@
+import {
+  BLOCK_DEFAULT_STYLE,
+  DEFAULT_BLOCK,
+  HTML_HEADER_TAGS,
+} from '../../constants'
+import type {DeserializerRule} from '../../types'
+import {isElement, tagName} from '../helpers'
+function getListItemStyle(el: Node): string | undefined {
+  const style = isElement(el) && el.getAttribute('style')
+  if (!style) {
+    return undefined
+  }
+  if (!style.match(/lfo\d+/)) {
+    return undefined
+  }
+  return style.match('lfo1') ? 'number' : 'bullet'
+}
+function getListItemLevel(el: Node): number | undefined {
+  const style = isElement(el) && el.getAttribute('style')
+  if (!style) {
+    return undefined
+  }
+  const levelMatch = style.match(/level\d+/)
+  if (!levelMatch) {
+    return undefined
+  }
+  const [level] = levelMatch[0].match(/\d/) || []
+  const levelNum = level ? Number.parseInt(level, 10) : 1
+  return levelNum || 1
+}
+function isWordListElement(el: Node): boolean {
+  if (!isElement(el)) {
+    return false
+  }
+  // Check for specific class names
+  if (el.className) {
+    if (
+      el.className === 'MsoListParagraphCxSpFirst' ||
+      el.className === 'MsoListParagraphCxSpMiddle' ||
+      el.className === 'MsoListParagraphCxSpLast'
+    ) {
+      return true
+    }
+  }
+  // Check for mso-list in style attribute
+  const style = el.getAttribute('style')
+  if (style && /mso-list:\s*l\d+\s+level\d+\s+lfo\d+/.test(style)) {
+    return true
+  }
+  return false
+}
+function getHeadingStyle(el: Node): string | undefined {
+  const tag = tagName(el)
+  if (tag && HTML_HEADER_TAGS[tag]) {
+    return HTML_HEADER_TAGS[tag]?.style
+  }
+  return undefined
+}
+export function createWordRules(): DeserializerRule[] {
+  return [
+    {
+      deserialize(el, next) {
+        const tag = tagName(el)
+        // Handle list items (both paragraphs and headings)
+        if (
+          (tag === 'p' || HTML_HEADER_TAGS[tag || '']) &&
+          isWordListElement(el)
+        ) {
+          const headingStyle = getHeadingStyle(el)
+          return {
+            ...DEFAULT_BLOCK,
+            listItem: getListItemStyle(el),
+            level: getListItemLevel(el),
+            style: headingStyle || BLOCK_DEFAULT_STYLE,
+            children: next(el.childNodes),
+          }
+        }
+        return undefined
+      },
+    },
+  ]
+}