npm - @uniweb/semantic-parser - Versions diffs - 1.1.4 → 1.1.6 - Mend

@uniweb/semantic-parser 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/AGENTS.md +8 -11
package/README.md +3 -160
package/package.json +2 -5
package/src/index.js +1 -2
package/src/processors/groups.js +16 -15
package/docs/api.md +0 -350
package/docs/entity-consolidation.md +0 -470
package/docs/file-structure.md +0 -50
package/docs/guide.md +0 -206
package/docs/mapping-patterns.md +0 -928
package/docs/text-component-reference.md +0 -515
package/reference/README.md +0 -195
package/reference/Text.js +0 -188
package/src/mappers/accessor.js +0 -312
package/src/mappers/extractors.js +0 -416
package/src/mappers/helpers.js +0 -234
package/src/mappers/index.js +0 -28
package/src/mappers/types.js +0 -495
package/src/processors/groups_backup.js +0 -379
package/src/processors/groups_doc.md +0 -179
package/src/processors/sequence_backup.js +0 -402
package/src/processors_old/byType.js +0 -129
package/src/processors_old/groups.js +0 -240
package/src/processors_old/sequence.js +0 -140

package/src/processors/groups_backup.js DELETED Viewed

@@ -1,379 +0,0 @@
/**
 * Transform a sequence into content groups with semantic structure.
 *
 * The sequence is split on "divider" elements when at least one is present,
 * otherwise on heading patterns. Every resulting group is parsed with
 * `processGroupContent`. The first parsed group becomes `main` when
 * `identifyMainContent` says so; all remaining groups go to `items`. When the
 * content is in divider mode and starts with a divider, there is no main
 * content and every group is an item.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {Object} options Parsing options (forwarded to `splitByHeadings`)
 * @returns {Object} Content organized as `{ main, items, metadata }`
 */
function processGroups(sequence, options = {}) {
    const result = {
        main: null,
        items: [],
        metadata: {
            dividerMode: false,
            // NOTE: never updated by this function; it stays at 0.
            groups: 0,
        },
    };

    // A missing sequence is treated like an empty one instead of throwing.
    if (!sequence?.length) return result;

    // Divider mode is enabled as soon as one divider element is present.
    result.metadata.dividerMode = sequence.some((el) => el.type === "divider");

    // Split the sequence into raw groups.
    const groups = result.metadata.dividerMode
        ? splitByDividers(sequence)
        : splitByHeadings(sequence, options);

    // Parse the structure of each raw group.
    const processedGroups = groups.map((group) => processGroupContent(group));

    // Content that opens with a divider has no main section at all.
    if (result.metadata.dividerMode && groups.startsWithDivider) {
        result.items = processedGroups;
        return result;
    }

    // Otherwise the first group may be promoted to main content.
    if (identifyMainContent(processedGroups)) {
        result.main = processedGroups[0];
        result.items = processedGroups.slice(1);
    } else {
        result.items = processedGroups;
    }

    return result;
}
/**
 * Split a sequence into groups, using "divider" elements as separators.
 *
 * Dividers themselves are never part of a group, and a divider that closes an
 * empty group (consecutive or trailing dividers) produces nothing. The
 * returned array also carries a `startsWithDivider` boolean property, set when
 * a divider is met before any other element.
 *
 * @param {Array} sequence Flat sequence of elements
 * @returns {Array} Array of groups, with an extra `startsWithDivider` property
 */
function splitByDividers(sequence) {
    const groups = [];
    let pending = [];
    let leadingDivider = false;

    for (const item of sequence) {
        if (item.type !== "divider") {
            pending.push(item);
            continue;
        }
        if (pending.length > 0) {
            // The divider closes the group collected so far.
            groups.push(pending);
            pending = [];
        } else if (groups.length === 0) {
            // Nothing collected and nothing emitted yet: content opens with a divider.
            leadingDivider = true;
        }
    }

    // Flush whatever follows the last divider.
    if (pending.length > 0) {
        groups.push(pending);
    }

    groups.startsWithDivider = leadingDivider;
    return groups;
}
/**
 * Split a sequence into groups using heading patterns.
 *
 * A group is opened by a heading run (see `readHeadingGroup`). A banner image
 * or a pretitle that comes right before such a run "pre-opens" the group, so
 * the headings that follow join it instead of starting another one.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {Object} options Parsing options (not read by this function)
 * @returns {Array<Array>} Groups of elements, in document order
 */
function splitByHeadings(sequence, options = {}) {
    const groups = [];
    let pending = [];
    // True while the pending group holds only a banner and/or a pretitle.
    let preOpened = false;

    // Close the pending group (unless it is merely pre-opened) and record
    // whether the group being started is itself pre-opened.
    const openGroup = (asPreOpened) => {
        if (pending.length && !preOpened) {
            groups.push(pending);
            pending = [];
        }
        preOpened = asPreOpened;
    };

    let cursor = 0;
    while (cursor < sequence.length) {
        // A banner is only considered while no group has been emitted yet.
        if (!groups.length && isBannerImage(sequence, cursor)) {
            openGroup(true);
            pending.push(sequence[cursor]);
            cursor += 1; // isBannerImage guarantees a following element
        }

        // A pretitle is handled before the heading run is consumed.
        if (isPreTitle(sequence, cursor)) {
            openGroup(true);
            pending.push(sequence[cursor]);
            cursor += 1; // the following element is a heading
        }

        const current = sequence[cursor];
        if (current.type === "heading") {
            const run = readHeadingGroup(sequence, cursor);
            openGroup(false);
            pending.push(...run);
            cursor += run.length; // step past every heading just added
        } else {
            pending.push(current);
            cursor += 1;
        }
    }

    if (pending.length > 0) {
        groups.push(pending);
    }
    return groups;
}
/**
 * Tell whether the element at index `i` is a pretitle: a heading immediately
 * followed by a more important heading, i.e. one with a smaller level number
 * (e.g., H3→H1, H2→H1, H6→H5).
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a pretitle
 */
function isPreTitle(sequence, i) {
    // There must be a following element to compare against.
    if (!(i + 1 < sequence.length)) return false;

    const candidate = sequence[i];
    if (candidate.type !== "heading") return false;

    const follower = sequence[i + 1];
    if (follower.type !== "heading") return false;

    // Larger level number means a smaller heading.
    return candidate.level > follower.level;
}
/**
 * Tell whether the element at index `i` is a banner image: an image that is
 * not the last element and that either has the "banner" role or is directly
 * followed by a heading.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index of the candidate element
 * @returns {boolean} True when `sequence[i]` is a banner image
 */
function isBannerImage(sequence, i) {
    // The last element of a sequence is never a banner.
    if (!(i + 1 < sequence.length)) return false;

    const candidate = sequence[i];
    if (candidate.type !== "image") return false;

    // An explicit role wins; otherwise look at what follows the image.
    if (candidate.role === "banner") return true;
    return sequence[i + 1].type === "heading";
}
/**
 * Eagerly collect the run of headings that starts at index `i`.
 *
 * The element at `i` is always taken. Each following element is added while
 * it is a heading whose level number is strictly greater than that of the
 * element just before it; the first element that breaks the rule ends the run.
 *
 * @param {Array} sequence Flat sequence of elements
 * @param {number} i Index where the run starts
 * @returns {Array} The collected elements, starting with `sequence[i]`
 */
function readHeadingGroup(sequence, i) {
    const run = [sequence[i]];
    let cursor = i + 1;

    while (cursor < sequence.length) {
        const candidate = sequence[cursor];
        const continuesRun =
            candidate.type === "heading" &&
            candidate.level > sequence[cursor - 1].level;
        if (!continuesRun) break;

        run.push(candidate);
        cursor += 1;
    }

    return run;
}
/**
 * Process a group's content to identify its structure.
 *
 * Walks the group's elements once and sorts them into:
 *   - `header`: pretitle, title, subtitle, subtitle2 (filled from headings in
 *     the order they appear) and the text alignment of the first heading that
 *     declares one;
 *   - `banner`: the image recognised by `isBannerImage`, if any;
 *   - `body`: everything else, bucketed by element type (headings beyond
 *     subtitle2 end up in `body.headings`);
 *   - `metadata`: `level` is the level of the first heading placed in the
 *     header or body (a pretitle is skipped); `contentTypes` is created empty
 *     and not filled here.
 *
 * Elements whose type is not handled below are ignored.
 *
 * @param {Array} elements Elements of one group. A value that is not an array
 *     is treated as an empty group.
 * @returns {Object} `{ header, body, banner, metadata }`
 */
function processGroupContent(elements) {
    const header = {
        pretitle: "",
        title: "",
        subtitle: "",
        subtitle2: "",
        alignment: null,
    };
    let banner = null;
    const body = {
        imgs: [],
        icons: [],
        videos: [],
        paragraphs: [],
        links: [],
        lists: [],
        buttons: [],
        properties: {},
        propertyBlocks: [],
        cards: [],
        documents: [],
        forms: [],
        quotes: [],
        headings: [],
    };
    const metadata = {
        level: null,
        contentTypes: new Set(),
    };

    // Tolerate missing content (e.g. a list item or blockquote without any).
    const items = Array.isArray(elements) ? elements : [];

    for (let i = 0; i < items.length; i++) {
        // Banner is checked before pretitle, matching the order in which
        // splitByHeadings assembles a group (banner, pretitle, headings).
        // Checking pretitle first would store a pretitle that follows a
        // banner as the title.
        if (isBannerImage(items, i)) {
            banner = {
                url: items[i].src,
                caption: items[i].caption,
                alt: items[i].alt,
            };
            i++; // isBannerImage guarantees a following element
        }
        if (isPreTitle(items, i)) {
            header.pretitle = items[i].content;
            i++; // move to the more important heading that follows
        }

        const element = items[i];

        if (element.type === "heading") {
            metadata.level ??= element.level;

            // Alignment comes from the first heading that declares one.
            if (!header.alignment && element.attrs?.textAlign) {
                header.alignment = element.attrs.textAlign;
            }

            // Fill header slots in order; overflow goes to the body.
            if (!header.title) {
                header.title = element.content;
            } else if (!header.subtitle) {
                header.subtitle = element.content;
            } else if (!header.subtitle2) {
                header.subtitle2 = element.content;
            } else {
                body.headings.push(element.content);
            }
            continue;
        }

        if (element.type === "list") {
            body.lists.push(processListContent(element));
            continue;
        }

        switch (element.type) {
            case "paragraph":
                body.paragraphs.push(element.content);
                break;
            case "image":
                body.imgs.push({
                    url: element.src,
                    caption: element.caption,
                    alt: element.alt,
                });
                break;
            case "link":
                body.links.push({
                    href: element.content.href,
                    label: element.content.label,
                });
                break;
            case "styledLink":
                // Styled link (multi-part with same href)
                body.links.push({
                    href: element.href,
                    label: element.content,
                    target: element.target,
                });
                break;
            case "icon":
                body.icons.push(element.svg);
                break;
            case "button":
                body.buttons.push(element);
                break;
            case "video":
                body.videos.push({
                    src: element.src,
                    caption: element.caption,
                    alt: element.alt,
                });
                break;
            case "blockquote": {
                // Parse the quoted elements recursively and keep only their
                // body. (The original passed an undeclared `options` here,
                // which threw a ReferenceError on every blockquote.)
                const quoteContent = processGroupContent(element.content);
                body.quotes.push(quoteContent.body);
                break;
            }
            case "codeBlock": {
                // Use parsed data if available, otherwise the text content.
                // `??` also covers a `parsed` field that is absent.
                const codeData = element.parsed ?? element.content;
                body.properties = codeData; // Last one wins
                body.propertyBlocks.push(codeData); // All of them
                break;
            }
            case "card-group":
                body.cards.push(...element.cards);
                break;
            case "document-group":
                body.documents.push(...element.documents);
                break;
            case "form":
                body.forms.push(element.data || element.attrs);
                break;
        }
    }

    return {
        header,
        body,
        banner,
        metadata,
    };
}
/**
 * Convert a list element into an array of parsed item bodies.
 *
 * Each item's `content` is parsed with `processGroupContent` and only the
 * resulting `body` is kept. When an item has nested items, their parsed
 * bodies replace the item's `lists` field as a single nested list.
 *
 * A list without `items`, or an item without nested `items`, is treated as
 * having none instead of throwing.
 *
 * @param {Object} list List element; `list.items` holds the list items
 * @returns {Array<Object>} One parsed body per list item
 */
function processListContent(list) {
    const listItems = list?.items ?? [];

    return listItems.map((item) => {
        const nestedList = item.items ?? [];
        const parsedContent = processGroupContent(item.content).body;

        if (nestedList.length) {
            const parsedNestedList = nestedList.map(
                (nestedItem) => processGroupContent(nestedItem.content).body
            );
            // Overwrites any lists found in the item's own content.
            parsedContent.lists = [parsedNestedList];
        }

        return parsedContent;
    });
}
/**
 * Decide whether the first group should be treated as main content.
 *
 * No groups: never. A single group: always. Otherwise the first group must
 * have a heading level, and it must be more important (a smaller level
 * number) than the second group's level, or the second group must have no
 * level at all.
 *
 * @param {Array<Object>} groups Parsed groups, each with `metadata.level`
 * @returns {boolean} True when `groups[0]` is the main content
 */
function identifyMainContent(groups) {
    switch (groups.length) {
        case 0:
            return false;
        case 1:
            return true;
        default:
            break;
    }

    const leadLevel = groups[0].metadata.level;
    const nextLevel = groups[1].metadata.level;

    // Without a heading level the first group cannot outrank anything.
    if (!leadLevel) return false;
    // A second group without a level is always outranked.
    if (!nextLevel) return true;
    return leadLevel < nextLevel;
}
-export { processGroups };

package/src/processors/groups_doc.md DELETED Viewed

@@ -1,179 +0,0 @@
-# Content Grouping Logic
-This document outlines how the `processGroups` function interprets flat arrays of content (Headings, Paragraphs, etc.) and organizes them into semantic **Main Content** and **List Items**.
-## The Core Challenge
-The parser must distinguish between two visually similar but semantically different patterns:
-1. **Subtitles:** A smaller heading that belongs to the main title (Merge).
-2. **List Items:** A smaller heading that starts a new list item (Split).
-## The Logic (Heuristics)
-To make this decision, the parser looks ahead at the structure:
-1. **Sibling Boundary:** If we are at Level X and encounter another Level X, it is always a sibling. We **Split**.
-2. **Peer Detection:** If we are stepping down (H1 → H2), we check if that H2 has a "peer" (another H2) later in the section.
-3. **Leaf vs. Branch:**
-    - **Leaf:** A heading with no sub-headings underneath it.
-    - **Branch:** A heading with sub-headings (e.g., H2 followed by H3 dates).
-## Supported Patterns & Behavior
-### 1. The "Resume" Pattern (Items)
--   **Structure:** `H1` → `H2 (Branch)` → `H2 (Branch)`
--   **Use Case:** Academic Experience, Work History.
--   **Behavior:** The parser sees the first H2 has a peer. Both are "Branches" (have children).
--   **Result:** **Split**. The H1 becomes Main; the H2s become separate Items.
-**Input Data Structure:**
-```
-{
-  "type": "doc",
-  "content": [
-    { "type": "heading", "attrs": { "level": 1 }, "content": [{ "type": "text", "text": "Academic Experience" }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Ph.D. in CS" }] },
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "2014-2018" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "MIT" }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Masters in Data" }] },
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "2012-2014" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "Berkeley" }] }
-  ]
-}
-```
-**Parsed Output:**
-```
-[Main]  title: Academic Experience
-[Item]  title: Ph.D. in CS
-        subtitle: 2014-2018
-[Item]  title: Masters in Data
-        subtitle: 2012-2014
-```
-### 2. The "Standard" Pattern (Leaf Items)
--   **Structure:** `H1` → `H2 (Leaf)` → `H2 (Leaf)`
--   **Use Case:** Features list, standard sections.
--   **Behavior:** Even though the H2s are leaves (no children), the parser detects a peer (another H2).
--   **Result:** **Split**. Sibling detection forces them into separate items.
-**Input Data Structure:**
-```
-{
-  "type": "doc",
-  "content": [
-    { "type": "heading", "attrs": { "level": 1 }, "content": [{ "type": "text", "text": "Features" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "Our main features." }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Feature One" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "First feature description." }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Feature Two" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "Second feature description." }] }
-  ]
-}
-```
-**Parsed Output:**
-```
-[Main]  title: Features
-        body: Our main features.
-[Item]  title: Feature One
-        body: First feature description.
-[Item]  title: Feature Two
-        body: Second feature description.
-```
-### 3. The "Hybrid" Pattern (Intro Subtitle + Items)
--   **Structure:** `H1` → `H2 (Leaf)` → `H2 (Branch)`
--   **Use Case:** A section with a summary heading before the list starts.
--   **Behavior:** The parser compares the first H2 (Leaf) against the second H2 (Branch).
--   **Result:** **Merge then Split**. The first H2 merges into Main. The second H2 starts the first Item.
-**Input Data Structure:**
-```
-{
-  "type": "doc",
-  "content": [
-    { "type": "heading", "attrs": { "level": 1 }, "content": [{ "type": "text", "text": "Work History" }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "A summary of my roles." }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Google" }] },
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "2020-Present" }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Facebook" }] },
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "2018-2020" }] }
-  ]
-}
-```
-**Parsed Output:**
-```
-[Main]  title: Work History
-        subtitle: "A summary of my roles."
-[Item]  title: Google
-        subtitle: 2020-Present
-[Item]  title: Facebook
-        subtitle: 2018-2020
-```
-### 4. The "Deep Header" Pattern
--   **Structure:** `H3` → `H1` → `H2` → `H3`
--   **Use Case:** Complex Hero sections with pre-titles and multiple subtitles.
--   **Behavior:** The headings are strictly sequential or hierarchical components of a single block.
--   **Result:** **Merge**. Treats the hierarchy as a single deep header block.
-**Input Data Structure:**
-```
-{
-  "type": "doc",
-  "content": [
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "WELCOME" }] },
-    { "type": "heading", "attrs": { "level": 1 }, "content": [{ "type": "text", "text": "Main Title" }] },
-    { "type": "heading", "attrs": { "level": 2 }, "content": [{ "type": "text", "text": "Subtitle" }] },
-    { "type": "heading", "attrs": { "level": 3 }, "content": [{ "type": "text", "text": "Subsubtitle" }] },
-    { "type": "paragraph", "content": [{ "type": "text", "text": "Content." }] }
-  ]
-}
-```
-**Parsed Output:**
-```
-[Main]  title: Main Title
-        pretitle: WELCOME
-        subtitle: Subtitle
-        subtitle2: Subsubtitle
-```