@uniweb/semantic-parser 1.0.12 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/package.json +1 -1
- package/src/processors/sequence.js +136 -26
package/README.md
CHANGED
|
@@ -322,10 +322,30 @@ Inline formatting is preserved as HTML tags:
|
|
|
322
322
|
// Input: Text with bold mark
|
|
323
323
|
// Output: "Text with <strong>bold</strong>"
|
|
324
324
|
|
|
325
|
+
// Input: Text with italic mark
|
|
326
|
+
// Output: "Text with <em>emphasis</em>"
|
|
327
|
+
|
|
325
328
|
// Input: Link mark
|
|
326
329
|
// Output: "Click <a href=\"/docs\">here</a>"
|
|
330
|
+
|
|
331
|
+
// Input: Span mark (bracketed spans)
|
|
332
|
+
// Output: "This is <span class=\"highlight\">highlighted</span> text"
|
|
327
333
|
```
|
|
328
334
|
|
|
335
|
+
### Span Marks
|
|
336
|
+
|
|
337
|
+
Bracketed spans (`[text]{.class}`) are converted to `<span>` elements with their attributes:
|
|
338
|
+
|
|
339
|
+
```js
|
|
340
|
+
// Input mark
|
|
341
|
+
{ type: "span", attrs: { class: "highlight", id: "note-1" } }
|
|
342
|
+
|
|
343
|
+
// Output HTML
|
|
344
|
+
'<span class="highlight" id="note-1">text</span>'
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
Spans can have classes, IDs, and custom attributes. They combine with other marks—a span with bold becomes `<strong><span class="...">text</span></strong>`.
|
|
348
|
+
|
|
329
349
|
## Documentation
|
|
330
350
|
|
|
331
351
|
- **[Content Writing Guide](./docs/guide.md)**: Learn how to structure content for optimal parsing
|
package/package.json
CHANGED
package/src/processors/sequence.js
CHANGED
|
@@ -62,24 +62,19 @@ function processSequence(doc, options = {}) {
|
|
|
62
62
|
|
|
63
63
|
function processNode(node, sequence, options) {
|
|
64
64
|
if (node.content && Array.isArray(node.content)) {
|
|
65
|
-
// node.content?.forEach((child) => processNode(child, sequence, options));
|
|
66
|
-
// return;
|
|
67
65
|
node.content?.forEach((child) => {
|
|
68
|
-
const
|
|
69
|
-
|
|
70
|
-
if (
|
|
71
|
-
|
|
66
|
+
const result = createSequenceElement(child, options);
|
|
67
|
+
|
|
68
|
+
if (result) {
|
|
69
|
+
// Handle case where element returns multiple items (e.g., paragraph with only links)
|
|
70
|
+
if (Array.isArray(result)) {
|
|
71
|
+
sequence.push(...result);
|
|
72
|
+
} else {
|
|
73
|
+
sequence.push(result);
|
|
74
|
+
}
|
|
72
75
|
}
|
|
73
76
|
});
|
|
74
77
|
}
|
|
75
|
-
|
|
76
|
-
// Create element based on node type
|
|
77
|
-
// const element = createSequenceElement(node, options);
|
|
78
|
-
|
|
79
|
-
// //Skip empty paragraph when create sequence
|
|
80
|
-
// if (element) {
|
|
81
|
-
// sequence.push(element);
|
|
82
|
-
// }
|
|
83
78
|
}
|
|
84
79
|
|
|
85
80
|
function createSequenceElement(node, options = {}) {
|
|
@@ -95,6 +90,12 @@ function createSequenceElement(node, options = {}) {
|
|
|
95
90
|
};
|
|
96
91
|
}
|
|
97
92
|
|
|
93
|
+
// Check for paragraph containing only multiple links (no other text)
|
|
94
|
+
const multipleLinks = isOnlyLinks(node);
|
|
95
|
+
if (multipleLinks) {
|
|
96
|
+
return multipleLinks; // Returns array of link elements
|
|
97
|
+
}
|
|
98
|
+
|
|
98
99
|
const styledLink = isStyledLink(node);
|
|
99
100
|
|
|
100
101
|
if (styledLink) return styledLink;
|
|
@@ -281,6 +282,26 @@ function getTextContent(content, options = {}) {
|
|
|
281
282
|
styledText = `<span style="background-color: var(--highlight)">${styledText}</span>`;
|
|
282
283
|
}
|
|
283
284
|
|
|
285
|
+
// span (bracketed spans with class/id/attributes)
|
|
286
|
+
if (marks.some((mark) => mark.type === "span")) {
|
|
287
|
+
const spanMark = marks.find((mark) => mark.type === "span");
|
|
288
|
+
const attrs = spanMark?.attrs || {};
|
|
289
|
+
const attrParts = [];
|
|
290
|
+
|
|
291
|
+
if (attrs.class) attrParts.push(`class="${attrs.class}"`);
|
|
292
|
+
if (attrs.id) attrParts.push(`id="${attrs.id}"`);
|
|
293
|
+
|
|
294
|
+
// Add any other custom attributes (data-*, etc.)
|
|
295
|
+
for (const [key, value] of Object.entries(attrs)) {
|
|
296
|
+
if (key !== 'class' && key !== 'id') {
|
|
297
|
+
attrParts.push(`${key}="${value}"`);
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
const attrString = attrParts.length > 0 ? ` ${attrParts.join(' ')}` : '';
|
|
302
|
+
styledText = `<span${attrString}>${styledText}</span>`;
|
|
303
|
+
}
|
|
304
|
+
|
|
284
305
|
// bold
|
|
285
306
|
if (marks.some((mark) => mark.type === "bold")) {
|
|
286
307
|
styledText = `<strong>${styledText}</strong>`;
|
|
@@ -434,7 +455,7 @@ function parseDocumentBlock(itemAttrs) {
|
|
|
434
455
|
}
|
|
435
456
|
|
|
436
457
|
function parseUniwebIcon(itemAttrs) {
|
|
437
|
-
let { svg, url, size, color, preserveColors } = itemAttrs;
|
|
458
|
+
let { svg, url, size, color, preserveColors, href, target } = itemAttrs || {};
|
|
438
459
|
|
|
439
460
|
return {
|
|
440
461
|
svg,
|
|
@@ -442,6 +463,8 @@ function parseUniwebIcon(itemAttrs) {
|
|
|
442
463
|
size,
|
|
443
464
|
color,
|
|
444
465
|
preserveColors,
|
|
466
|
+
href,
|
|
467
|
+
target,
|
|
445
468
|
};
|
|
446
469
|
}
|
|
447
470
|
|
|
@@ -461,12 +484,13 @@ function parseImgBlock(itemAttrs) {
|
|
|
461
484
|
alt = "",
|
|
462
485
|
url,
|
|
463
486
|
href = "",
|
|
487
|
+
target = "",
|
|
464
488
|
theme,
|
|
465
489
|
role,
|
|
466
490
|
credit = "",
|
|
467
491
|
} = itemAttrs;
|
|
468
492
|
|
|
469
|
-
let { contentType, viewType, contentId, identifier } = imgInfo;
|
|
493
|
+
let { contentType, viewType, contentId, identifier } = imgInfo || {};
|
|
470
494
|
|
|
471
495
|
const sizes = {
|
|
472
496
|
center: "basic",
|
|
@@ -493,6 +517,7 @@ function parseImgBlock(itemAttrs) {
|
|
|
493
517
|
imgPos: direction === "left" || direction === "right" ? direction : "",
|
|
494
518
|
size: sizes[direction] || "basic",
|
|
495
519
|
href,
|
|
520
|
+
target,
|
|
496
521
|
theme,
|
|
497
522
|
role,
|
|
498
523
|
credit,
|
|
@@ -507,6 +532,8 @@ function parseVideoBlock(itemAttrs) {
|
|
|
507
532
|
info = {},
|
|
508
533
|
coverImg = {},
|
|
509
534
|
alt,
|
|
535
|
+
href = "",
|
|
536
|
+
target = "",
|
|
510
537
|
} = itemAttrs;
|
|
511
538
|
|
|
512
539
|
let video = makeAssetUrl({
|
|
@@ -520,6 +547,8 @@ function parseVideoBlock(itemAttrs) {
|
|
|
520
547
|
direction,
|
|
521
548
|
coverImg: makeAssetUrl(coverImg),
|
|
522
549
|
alt,
|
|
550
|
+
href,
|
|
551
|
+
target,
|
|
523
552
|
};
|
|
524
553
|
}
|
|
525
554
|
|
|
@@ -539,35 +568,64 @@ function stripTags(htmlString) {
|
|
|
539
568
|
}
|
|
540
569
|
|
|
541
570
|
function isLink(item) {
|
|
542
|
-
//
|
|
571
|
+
// Detect paragraphs/headings that are semantically "just a link"
|
|
572
|
+
// (single link text, possibly with decorative icons)
|
|
573
|
+
//
|
|
574
|
+
// For single-link paragraphs, the icon-link association is unambiguous:
|
|
575
|
+
// - Icons before the link text → iconBefore
|
|
576
|
+
// - Icons after the link text → iconAfter
|
|
577
|
+
//
|
|
578
|
+
// This supports natural content authoring: insert icon, type link text, add href
|
|
543
579
|
if (["paragraph", "heading"].includes(item.type)) {
|
|
544
|
-
|
|
580
|
+
const originalContent = item?.content || [];
|
|
545
581
|
|
|
546
|
-
//
|
|
547
|
-
|
|
582
|
+
// Filter out icons and whitespace to check for single link
|
|
583
|
+
const textContent = originalContent.filter((c) => {
|
|
548
584
|
if (c.type === "UniwebIcon") {
|
|
549
585
|
return false;
|
|
550
586
|
} else if (c.type === "text") {
|
|
551
587
|
return (c.text || "").trim() !== "";
|
|
552
588
|
}
|
|
553
|
-
|
|
554
589
|
return true;
|
|
555
590
|
});
|
|
556
591
|
|
|
557
|
-
if (
|
|
558
|
-
let contentItem =
|
|
592
|
+
if (textContent.length === 1) {
|
|
593
|
+
let contentItem = textContent[0];
|
|
559
594
|
let marks = contentItem?.marks || [];
|
|
560
595
|
|
|
561
596
|
for (let l = 0; l < marks.length; l++) {
|
|
562
597
|
let mark = marks[l];
|
|
563
598
|
|
|
564
|
-
|
|
599
|
+
if (mark?.type === "link") {
|
|
600
|
+
// Find the position of the link text in the original content
|
|
601
|
+
const linkIndex = originalContent.findIndex(
|
|
602
|
+
(c) => c.type === "text" && c.text === contentItem.text
|
|
603
|
+
);
|
|
604
|
+
|
|
605
|
+
// Collect icons before and after the link text
|
|
606
|
+
let iconBefore = null;
|
|
607
|
+
let iconAfter = null;
|
|
608
|
+
|
|
609
|
+
for (let i = 0; i < originalContent.length; i++) {
|
|
610
|
+
if (originalContent[i].type === "UniwebIcon") {
|
|
611
|
+
const iconAttrs = parseUniwebIcon(originalContent[i].attrs);
|
|
612
|
+
if (i < linkIndex) {
|
|
613
|
+
// Take the last icon before the link
|
|
614
|
+
iconBefore = iconAttrs;
|
|
615
|
+
} else if (i > linkIndex) {
|
|
616
|
+
// Take the first icon after the link
|
|
617
|
+
if (!iconAfter) iconAfter = iconAttrs;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
}
|
|
565
621
|
|
|
566
|
-
if (markType === "link") {
|
|
567
622
|
return {
|
|
568
623
|
href: mark?.attrs?.href,
|
|
569
624
|
label: contentItem?.text || "",
|
|
570
|
-
|
|
625
|
+
iconBefore,
|
|
626
|
+
iconAfter,
|
|
627
|
+
// Preserve all inline elements for advanced rendering
|
|
628
|
+
children: processInlineElements(originalContent),
|
|
571
629
|
};
|
|
572
630
|
}
|
|
573
631
|
}
|
|
@@ -577,6 +635,58 @@ function isLink(item) {
|
|
|
577
635
|
return false;
|
|
578
636
|
}
|
|
579
637
|
|
|
638
|
+
/**
|
|
639
|
+
* Check if a paragraph contains ONLY links (multiple links, no other text)
|
|
640
|
+
* If so, return array of link data to be added to sequence separately.
|
|
641
|
+
*
|
|
642
|
+
* This handles the common pattern of writing links on consecutive lines:
|
|
643
|
+
* ```
|
|
644
|
+
* [Privacy Policy](/privacy)
|
|
645
|
+
* [Terms of Service](/terms)
|
|
646
|
+
* ```
|
|
647
|
+
* Markdown treats these as a single paragraph, but semantically they're separate links.
|
|
648
|
+
*
|
|
649
|
+
* @param {Object} item - Sequence item (paragraph)
|
|
650
|
+
* @returns {Array|false} Array of link objects or false
|
|
651
|
+
*/
|
|
652
|
+
function isOnlyLinks(item) {
|
|
653
|
+
if (item.type !== "paragraph") return false;
|
|
654
|
+
|
|
655
|
+
const content = item?.content || [];
|
|
656
|
+
if (!content.length) return false;
|
|
657
|
+
|
|
658
|
+
// Filter to get only significant content (no icons, no whitespace)
|
|
659
|
+
const textContent = content.filter((c) => {
|
|
660
|
+
if (c.type === "UniwebIcon") return false;
|
|
661
|
+
if (c.type === "text" && !(c.text || "").trim()) return false;
|
|
662
|
+
return true;
|
|
663
|
+
});
|
|
664
|
+
|
|
665
|
+
if (textContent.length < 2) return false; // Single link handled by isLink
|
|
666
|
+
|
|
667
|
+
// Check if ALL remaining content items are text nodes with link marks
|
|
668
|
+
const allLinks = textContent.every((c) => {
|
|
669
|
+
if (c.type !== "text") return false;
|
|
670
|
+
const hasLinkMark = c.marks?.some((m) => m.type === "link");
|
|
671
|
+
return hasLinkMark;
|
|
672
|
+
});
|
|
673
|
+
|
|
674
|
+
if (!allLinks) return false;
|
|
675
|
+
|
|
676
|
+
// Extract links as simple {href, label} objects
|
|
677
|
+
// Icons in this paragraph go to body.icons separately (no association)
|
|
678
|
+
return textContent.map((c) => {
|
|
679
|
+
const linkMark = c.marks.find((m) => m.type === "link");
|
|
680
|
+
return {
|
|
681
|
+
type: "link",
|
|
682
|
+
attrs: {
|
|
683
|
+
href: linkMark?.attrs?.href,
|
|
684
|
+
label: c.text || "",
|
|
685
|
+
},
|
|
686
|
+
};
|
|
687
|
+
});
|
|
688
|
+
}
|
|
689
|
+
|
|
580
690
|
// method to check if given item has multiple content parts and each of them has the same link attrs with different inline style (plain, em, strong, u)
|
|
581
691
|
// if so, it will return the link attrs and all the content parts whose link mark has been removed
|
|
582
692
|
// warning: This method will not work if any of the content parts are not link marks
|