@uniweb/semantic-parser 1.0.12 → 1.0.14

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -322,10 +322,30 @@ Inline formatting is preserved as HTML tags:
322
322
  // Input: Text with bold mark
323
323
  // Output: "Text with <strong>bold</strong>"
324
324
 
325
+ // Input: Text with italic mark
326
+ // Output: "Text with <em>emphasis</em>"
327
+
325
328
  // Input: Link mark
326
329
  // Output: "Click <a href=\"/docs\">here</a>"
330
+
331
+ // Input: Span mark (bracketed spans)
332
+ // Output: "This is <span class=\"highlight\">highlighted</span> text"
327
333
  ```
328
334
 
335
+ ### Span Marks
336
+
337
+ Bracketed spans (`[text]{.class}`) are converted to `<span>` elements with their attributes:
338
+
339
+ ```js
340
+ // Input mark
341
+ { type: "span", attrs: { class: "highlight", id: "note-1" } }
342
+
343
+ // Output HTML
344
+ '<span class="highlight" id="note-1">text</span>'
345
+ ```
346
+
347
+ Spans can have classes, IDs, and custom attributes. They combine with other marks—a span with bold becomes `<strong><span class="...">text</span></strong>`.
348
+
329
349
  ## Documentation
330
350
 
331
351
  - **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@uniweb/semantic-parser",
3
- "version": "1.0.12",
3
+ "version": "1.0.14",
4
4
  "description": "Semantic parser for ProseMirror/TipTap content structures",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
@@ -62,24 +62,19 @@ function processSequence(doc, options = {}) {
62
62
 
63
63
  function processNode(node, sequence, options) {
64
64
  if (node.content && Array.isArray(node.content)) {
65
- // node.content?.forEach((child) => processNode(child, sequence, options));
66
- // return;
67
65
  node.content?.forEach((child) => {
68
- const element = createSequenceElement(child, options);
69
-
70
- if (element) {
71
- sequence.push(element);
66
+ const result = createSequenceElement(child, options);
67
+
68
+ if (result) {
69
+ // Handle case where element returns multiple items (e.g., paragraph with only links)
70
+ if (Array.isArray(result)) {
71
+ sequence.push(...result);
72
+ } else {
73
+ sequence.push(result);
74
+ }
72
75
  }
73
76
  });
74
77
  }
75
-
76
- // Create element based on node type
77
- // const element = createSequenceElement(node, options);
78
-
79
- // //Skip empty paragraph when create sequence
80
- // if (element) {
81
- // sequence.push(element);
82
- // }
83
78
  }
84
79
 
85
80
  function createSequenceElement(node, options = {}) {
@@ -95,6 +90,12 @@ function createSequenceElement(node, options = {}) {
95
90
  };
96
91
  }
97
92
 
93
+ // Check for paragraph containing only multiple links (no other text)
94
+ const multipleLinks = isOnlyLinks(node);
95
+ if (multipleLinks) {
96
+ return multipleLinks; // Returns array of link elements
97
+ }
98
+
98
99
  const styledLink = isStyledLink(node);
99
100
 
100
101
  if (styledLink) return styledLink;
@@ -281,6 +282,26 @@ function getTextContent(content, options = {}) {
281
282
  styledText = `<span style="background-color: var(--highlight)">${styledText}</span>`;
282
283
  }
283
284
 
285
+ // span (bracketed spans with class/id/attributes)
286
+ if (marks.some((mark) => mark.type === "span")) {
287
+ const spanMark = marks.find((mark) => mark.type === "span");
288
+ const attrs = spanMark?.attrs || {};
289
+ const attrParts = [];
290
+
291
+ if (attrs.class) attrParts.push(`class="${attrs.class}"`);
292
+ if (attrs.id) attrParts.push(`id="${attrs.id}"`);
293
+
294
+ // Add any other custom attributes (data-*, etc.)
295
+ for (const [key, value] of Object.entries(attrs)) {
296
+ if (key !== 'class' && key !== 'id') {
297
+ attrParts.push(`${key}="${value}"`);
298
+ }
299
+ }
300
+
301
+ const attrString = attrParts.length > 0 ? ` ${attrParts.join(' ')}` : '';
302
+ styledText = `<span${attrString}>${styledText}</span>`;
303
+ }
304
+
284
305
  // bold
285
306
  if (marks.some((mark) => mark.type === "bold")) {
286
307
  styledText = `<strong>${styledText}</strong>`;
@@ -434,7 +455,7 @@ function parseDocumentBlock(itemAttrs) {
434
455
  }
435
456
 
436
457
  function parseUniwebIcon(itemAttrs) {
437
- let { svg, url, size, color, preserveColors } = itemAttrs;
458
+ let { svg, url, size, color, preserveColors, href, target } = itemAttrs || {};
438
459
 
439
460
  return {
440
461
  svg,
@@ -442,6 +463,8 @@ function parseUniwebIcon(itemAttrs) {
442
463
  size,
443
464
  color,
444
465
  preserveColors,
466
+ href,
467
+ target,
445
468
  };
446
469
  }
447
470
 
@@ -461,12 +484,13 @@ function parseImgBlock(itemAttrs) {
461
484
  alt = "",
462
485
  url,
463
486
  href = "",
487
+ target = "",
464
488
  theme,
465
489
  role,
466
490
  credit = "",
467
491
  } = itemAttrs;
468
492
 
469
- let { contentType, viewType, contentId, identifier } = imgInfo;
493
+ let { contentType, viewType, contentId, identifier } = imgInfo || {};
470
494
 
471
495
  const sizes = {
472
496
  center: "basic",
@@ -493,6 +517,7 @@ function parseImgBlock(itemAttrs) {
493
517
  imgPos: direction === "left" || direction === "right" ? direction : "",
494
518
  size: sizes[direction] || "basic",
495
519
  href,
520
+ target,
496
521
  theme,
497
522
  role,
498
523
  credit,
@@ -507,6 +532,8 @@ function parseVideoBlock(itemAttrs) {
507
532
  info = {},
508
533
  coverImg = {},
509
534
  alt,
535
+ href = "",
536
+ target = "",
510
537
  } = itemAttrs;
511
538
 
512
539
  let video = makeAssetUrl({
@@ -520,6 +547,8 @@ function parseVideoBlock(itemAttrs) {
520
547
  direction,
521
548
  coverImg: makeAssetUrl(coverImg),
522
549
  alt,
550
+ href,
551
+ target,
523
552
  };
524
553
  }
525
554
 
@@ -539,35 +568,64 @@ function stripTags(htmlString) {
539
568
  }
540
569
 
541
570
  function isLink(item) {
542
- //For fast check, we only assume link in paragraph or heading
571
+ // Detect paragraphs/headings that are semantically "just a link"
572
+ // (single link text, possibly with decorative icons)
573
+ //
574
+ // For single-link paragraphs, the icon-link association is unambiguous:
575
+ // - Icons before the link text → iconBefore
576
+ // - Icons after the link text → iconAfter
577
+ //
578
+ // This supports natural content authoring: insert icon, type link text, add href
543
579
  if (["paragraph", "heading"].includes(item.type)) {
544
- let content = item?.content || [];
580
+ const originalContent = item?.content || [];
545
581
 
546
- //filter out icons
547
- content = content.filter((c) => {
582
+ // Filter out icons and whitespace to check for single link
583
+ const textContent = originalContent.filter((c) => {
548
584
  if (c.type === "UniwebIcon") {
549
585
  return false;
550
586
  } else if (c.type === "text") {
551
587
  return (c.text || "").trim() !== "";
552
588
  }
553
-
554
589
  return true;
555
590
  });
556
591
 
557
- if (content.length === 1) {
558
- let contentItem = content?.[0];
592
+ if (textContent.length === 1) {
593
+ let contentItem = textContent[0];
559
594
  let marks = contentItem?.marks || [];
560
595
 
561
596
  for (let l = 0; l < marks.length; l++) {
562
597
  let mark = marks[l];
563
598
 
564
- const markType = mark?.type;
599
+ if (mark?.type === "link") {
600
+ // Find the position of the link text in the original content
601
+ const linkIndex = originalContent.findIndex(
602
+ (c) => c.type === "text" && c.text === contentItem.text
603
+ );
604
+
605
+ // Collect icons before and after the link text
606
+ let iconBefore = null;
607
+ let iconAfter = null;
608
+
609
+ for (let i = 0; i < originalContent.length; i++) {
610
+ if (originalContent[i].type === "UniwebIcon") {
611
+ const iconAttrs = parseUniwebIcon(originalContent[i].attrs);
612
+ if (i < linkIndex) {
613
+ // Take the last icon before the link
614
+ iconBefore = iconAttrs;
615
+ } else if (i > linkIndex) {
616
+ // Take the first icon after the link
617
+ if (!iconAfter) iconAfter = iconAttrs;
618
+ }
619
+ }
620
+ }
565
621
 
566
- if (markType === "link") {
567
622
  return {
568
623
  href: mark?.attrs?.href,
569
624
  label: contentItem?.text || "",
570
- children: processInlineElements(content),
625
+ iconBefore,
626
+ iconAfter,
627
+ // Preserve all inline elements for advanced rendering
628
+ children: processInlineElements(originalContent),
571
629
  };
572
630
  }
573
631
  }
@@ -577,6 +635,58 @@ function isLink(item) {
577
635
  return false;
578
636
  }
579
637
 
638
+ /**
639
+ * Check if a paragraph contains ONLY links (multiple links, no other text)
640
+ * If so, return array of link data to be added to sequence separately.
641
+ *
642
+ * This handles the common pattern of writing links on consecutive lines:
643
+ * ```
644
+ * [Privacy Policy](/privacy)
645
+ * [Terms of Service](/terms)
646
+ * ```
647
+ * Markdown treats these as a single paragraph, but semantically they're separate links.
648
+ *
649
+ * @param {Object} item - Sequence item (paragraph)
650
+ * @returns {Array|false} Array of link objects or false
651
+ */
652
+ function isOnlyLinks(item) {
653
+ if (item.type !== "paragraph") return false;
654
+
655
+ const content = item?.content || [];
656
+ if (!content.length) return false;
657
+
658
+ // Filter to get only significant content (no icons, no whitespace)
659
+ const textContent = content.filter((c) => {
660
+ if (c.type === "UniwebIcon") return false;
661
+ if (c.type === "text" && !(c.text || "").trim()) return false;
662
+ return true;
663
+ });
664
+
665
+ if (textContent.length < 2) return false; // Single link handled by isLink
666
+
667
+ // Check if ALL remaining content items are text nodes with link marks
668
+ const allLinks = textContent.every((c) => {
669
+ if (c.type !== "text") return false;
670
+ const hasLinkMark = c.marks?.some((m) => m.type === "link");
671
+ return hasLinkMark;
672
+ });
673
+
674
+ if (!allLinks) return false;
675
+
676
+ // Extract links as simple {href, label} objects
677
+ // Icons in this paragraph go to body.icons separately (no association)
678
+ return textContent.map((c) => {
679
+ const linkMark = c.marks.find((m) => m.type === "link");
680
+ return {
681
+ type: "link",
682
+ attrs: {
683
+ href: linkMark?.attrs?.href,
684
+ label: c.text || "",
685
+ },
686
+ };
687
+ });
688
+ }
689
+
580
690
  // Checks whether the given item has multiple content parts that all share the same link attrs but differ in inline style (plain, em, strong, u)
581
691
  // if so, it will return the link attrs and all the content parts whose link mark has been removed
582
692
  // warning: This method will not work if any of the content parts are not link marks