npm - mdld-parse - Versions diffs - 0.7.1 → 0.7.3 - Mend

mdld-parse 0.7.1 → 0.7.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "mdld-parse",
-	"version": "0.7.1",
+	"version": "0.7.3",
 	"description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
 	"type": "module",
 	"main": "index.js",
@@ -38,5 +38,8 @@
 	"homepage": "https://mdld.js.org",
 	"bugs": {
 		"url": "https://github.com/davay42/mdld-parse/issues"
+	},
+	"dependencies": {
+		"rdfa-parse": "^1.0.1"
 	}
 }

package/src/generate.js CHANGED Viewed

@@ -1,4 +1,5 @@
-import { shortenIRI, expandIRI, DEFAULT_CONTEXT, DataFactory } from './utils.js';
+import { shortenIRI, expandIRI, DataFactory } from './utils.js';
+import { DEFAULT_CONTEXT } from './shared.js';
 // Helper functions for cleaner term type checking
 function isLiteral(term) {

package/src/index.js CHANGED Viewed

@@ -3,8 +3,8 @@ export { merge } from './merge.js';
 export { generate } from './generate.js';
 export { locate } from './locate.js';
 export { render } from './render.js';
+export { DEFAULT_CONTEXT } from './shared.js';
 export {
-    DEFAULT_CONTEXT,
     DataFactory,
     hash,
     expandIRI,

package/src/merge.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { parse } from './parse.js';
-import { DEFAULT_CONTEXT } from './utils.js';
+import { DEFAULT_CONTEXT } from './shared.js';
 /**
  * Creates a unique key for quad identity matching
@@ -34,13 +34,14 @@ function normalizeInput(input, options, docContext) {
  * Merges multiple MDLD documents with diff polarity resolution
  * @param {Array<string|ParseResult>} docs
  * @param {Object} options
- * @returns {Object}
+ * @returns {Object} Merge result with quads, remove, statements, origin, and context
  */
 export function merge(docs, options = {}) {
     const sessionBuffer = new Map(); // Use Map instead of Set for proper quad storage
     const sessionRemoveSet = new Set();
     const allDocuments = [];
     const quadIndex = new Map();
+    const allStatements = []; // Collect statements from all documents
     // Process each document in order
     for (let i = 0; i < docs.length; i++) {
@@ -57,10 +58,16 @@ export function merge(docs, options = {}) {
             index: i,
             input: typeof input === 'string' ? 'string' : 'ParseResult',
             origin: doc.origin,
-            context: doc.context
+            context: doc.context,
+            statementsCount: doc.statements?.length || 0 // Track statements count
         };
         allDocuments.push(documentOrigin);
+        // Collect statements from this document
+        if (doc.statements && doc.statements.length > 0) {
+            allStatements.push(...doc.statements);
+        }
         // Fold assertions into session buffer
         for (const quad of doc.quads) {
             const key = quadKey(quad);
@@ -125,6 +132,7 @@ export function merge(docs, options = {}) {
     return {
         quads: filteredQuads,
         remove: filteredRemove,
+        statements: allStatements, // Include all collected statements
         origin: mergeOrigin,
         context: finalContext
     };

package/src/parse.js CHANGED Viewed

@@ -1,5 +1,4 @@
 import {
-    DEFAULT_CONTEXT,
     DataFactory,
     expandIRI,
     parseSemanticBlock,
@@ -7,18 +6,7 @@ import {
     createLiteral,
     hash
 } from './utils.js';
-const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
-const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
-const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
-const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
-const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
-const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
-const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
-const INLINE_CARRIER_PATTERNS = {
-    EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
-    CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
-};
+import { DEFAULT_CONTEXT, URL_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX, STANDALONE_SUBJECT_REGEX, INLINE_CARRIER_PATTERNS } from './shared.js';
 // Cache for fence regex patterns to avoid recreation
 const FENCE_CLOSE_PATTERNS = new Map();
@@ -210,6 +198,12 @@ function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, ex
     return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
 }
+// Pre-compiled carrier patterns for better performance
+const CARRIER_PATTERN_ARRAY = [
+    ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
+    ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
+];
 function extractInlineCarriers(text, baseOffset = 0) {
     const carriers = [];
     let pos = 0;
@@ -243,7 +237,8 @@ function extractInlineCarriers(text, baseOffset = 0) {
         const extractor = CARRIER_EXTRACTORS[text[pos]];
         if (extractor) return extractor(text, pos, baseOffset);
-        for (const [type, pattern] of Object.entries(INLINE_CARRIER_PATTERNS)) {
+        // Use pre-compiled patterns instead of Object.entries()
+        for (const [type, pattern] of CARRIER_PATTERN_ARRAY) {
             pattern.lastIndex = pos;
             const match = pattern.exec(text);
             if (match) {
@@ -342,6 +337,150 @@ function determineCarrierType(url) {
     return { carrierType: 'span', resourceIRI: null };
 }
+function createBlockEntry(token, state) {
+    const blockId = token._blockId || hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
+    token._blockId = blockId; // Store for later reference
+    const cleanText = extractCleanText(token);
+    const blockEntry = {
+        id: blockId,
+        type: token.type,
+        range: token.range,
+        text: cleanText,
+        subject: null,
+        types: [],
+        predicates: [],
+        carriers: [],
+        listLevel: token.indent || 0,
+        parentBlockId: state.blockStack.length > 0 ? state.blockStack[state.blockStack.length - 1] : null,
+        quadKeys: [] // Will be populated during quad emission
+    };
+    // Store block and add to document structure
+    state.origin.blocks.set(blockId, blockEntry);
+    state.origin.documentStructure.push(blockEntry);
+    return blockEntry;
+}
+function extractCleanText(token) {
+    if (!token.text) return '';
+    let text = token.text;
+    // Remove semantic annotations
+    if (token.attrsRange) {
+        const beforeAttrs = text.substring(0, token.attrsRange[0] - (token.range?.[0] || 0));
+        const afterAttrs = text.substring(token.attrsRange[1] - (token.range?.[0] || 0));
+        text = beforeAttrs + afterAttrs;
+    }
+    // Clean based on token type
+    switch (token.type) {
+        case 'heading':
+            return text.replace(/^#+\s*/, '').trim();
+        case 'list':
+            return text.replace(/^[-*+]\s*/, '').trim();
+        case 'blockquote':
+            return text.replace(/^>\s*/, '').trim();
+        default:
+            return text.trim();
+    }
+}
+function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
+    // Update subject if available
+    if (sem.subject && sem.subject !== 'RESET') {
+        const resolvedSubject = resolveSubject(sem, state);
+        if (resolvedSubject) {
+            blockEntry.subject = resolvedSubject.value;
+        }
+    }
+    // Add types
+    if (sem.types && sem.types.length > 0) {
+        sem.types.forEach(t => {
+            const typeIRI = typeof t === 'string' ? t : t.iri;
+            const expanded = expandIRI(typeIRI, state.ctx);
+            if (!blockEntry.types.includes(expanded)) {
+                blockEntry.types.push(expanded);
+            }
+        });
+    }
+    // Add predicates
+    if (sem.predicates && sem.predicates.length > 0) {
+        sem.predicates.forEach(pred => {
+            const expandedPred = {
+                iri: expandIRI(pred.iri, state.ctx),
+                form: pred.form || '',
+                object: null // Will be filled during quad emission
+            };
+            blockEntry.predicates.push(expandedPred);
+        });
+    }
+    // Add carrier information
+    if (carrier) {
+        const carrierInfo = {
+            type: carrier.type,
+            range: carrier.range,
+            text: carrier.text,
+            subject: null,
+            predicates: []
+        };
+        // Extract carrier-specific semantics
+        if (carrier.attrs) {
+            const carrierSem = parseSemCached(carrier.attrs);
+            if (carrierSem.types) {
+                carrierInfo.predicates = carrierSem.predicates || [];
+            }
+        }
+        blockEntry.carriers.push(carrierInfo);
+    }
+}
+function processAnnotationWithBlockTracking(carrier, sem, state, options = {}) {
+    const { preserveGlobalSubject = false, implicitSubject = null } = options;
+    if (sem.subject === 'RESET') {
+        state.currentSubject = null;
+        return;
+    }
+    const previousSubject = state.currentSubject;
+    const newSubject = resolveSubject(sem, state);
+    const localObject = resolveObject(sem, state);
+    const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
+    if (newSubject && !preserveGlobalSubject && !implicitSubject) {
+        state.currentSubject = newSubject;
+    }
+    const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
+    if (!S) return;
+    const block = createBlock(
+        S.value, sem.types, sem.predicates,
+        carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
+        carrier.type || null, state.ctx, carrier.text
+    );
+    const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
+    const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
+    const newSubjectOrCarrierO = newSubject || carrierO;
+    // Enrich current block with semantic information
+    if (state.currentBlock) {
+        enrichBlockFromAnnotation(state.currentBlock, sem, carrier, state);
+    }
+    processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
+    processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
+}
 function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
     const expanded = {
         subject,
@@ -364,7 +503,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
     };
 }
-function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
+function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null, state = null) {
     if (!subject || !predicate || !object) return;
     const quad = dataFactory.quad(subject, predicate, object);
@@ -400,19 +539,27 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
         // Detect rdf:Statement pattern during single-pass parsing
         detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
-        // Create lean origin entry
+        // Create lean origin entry - avoid spread operator for better performance
         const originEntry = {
             blockId: block.id,
             range: block.range,
             carrierType: block.carrierType,
             subject: subject.value,
             predicate: predicate.value,
-            context: { ...block.context },
+            context: block.context, // Direct reference instead of spread
             polarity: meta?.remove ? '-' : '+',
             value: block.text || ''
         };
         quadIndex.set(quadKey, originEntry);
+        // Link block to this quad for reverse lookup during rendering
+        if (state.currentBlock && block.id === state.currentBlock.id) {
+            if (!state.currentBlock.quadKeys) {
+                state.currentBlock.quadKeys = [];
+            }
+            state.currentBlock.quadKeys.push(quadKey);
+        }
     }
 }
@@ -474,8 +621,10 @@ function detectStatementPatternSinglePass(quad, dataFactory, meta, statements =
 const resolveFragment = (fragment, state) => {
     if (!state.currentSubject) return null;
-    const baseIRI = state.currentSubject.value.split('#')[0];
-    return state.df.namedNode(`${baseIRI}#${fragment}`);
+    const subjectValue = state.currentSubject.value;
+    const hashIndex = subjectValue.indexOf('#');
+    const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
+    return state.df.namedNode(baseIRI + '#' + fragment);
 };
 function resolveSubject(sem, state) {
@@ -504,7 +653,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
         state.df.namedNode(expandedType),
         state.df,
         { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
-        state.statements, state.statementCandidates
+        state.statements, state.statementCandidates,
+        state
     );
 };
@@ -548,43 +698,16 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
             emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
                 role.subject, P, role.object, state.df,
                 { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
-                state.statements, state.statementCandidates
+                state.statements, state.statementCandidates,
+                state
             );
         }
     });
 }
 function processAnnotation(carrier, sem, state, options = {}) {
-    const { preserveGlobalSubject = false, implicitSubject = null } = options;
-    if (sem.subject === 'RESET') {
-        state.currentSubject = null;
-        return;
-    }
-    const previousSubject = state.currentSubject;
-    const newSubject = resolveSubject(sem, state);
-    const localObject = resolveObject(sem, state);
-    const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
-    if (newSubject && !preserveGlobalSubject && !implicitSubject) {
-        state.currentSubject = newSubject;
-    }
-    const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
-    if (!S) return;
-    const block = createBlock(
-        S.value, sem.types, sem.predicates,
-        carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
-        carrier.type || null, state.ctx, carrier.text
-    );
-    const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
-    const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
-    const newSubjectOrCarrierO = newSubject || carrierO;
-    processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
-    processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
+    // Use the enhanced block tracking version
+    processAnnotationWithBlockTracking(carrier, sem, state, options);
 }
@@ -626,20 +749,60 @@ function processStandaloneSubject(token, state) {
 const TOKEN_PROCESSORS = {
     heading: (token, state) => {
+        const blockEntry = createBlockEntry(token, state);
+        state.currentBlock = blockEntry;
+        state.blockStack.push(blockEntry.id);
         processTokenAnnotations(token, state, token.type);
+        state.blockStack.pop();
+        state.currentBlock = state.blockStack.length > 0 ?
+            state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
     },
     code: (token, state) => {
+        const blockEntry = createBlockEntry(token, state);
+        state.currentBlock = blockEntry;
+        state.blockStack.push(blockEntry.id);
         processTokenAnnotations(token, state, token.type);
+        state.blockStack.pop();
+        state.currentBlock = state.blockStack.length > 0 ?
+            state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
     },
     blockquote: (token, state) => {
+        const blockEntry = createBlockEntry(token, state);
+        state.currentBlock = blockEntry;
+        state.blockStack.push(blockEntry.id);
         processTokenAnnotations(token, state, token.type);
+        state.blockStack.pop();
+        state.currentBlock = state.blockStack.length > 0 ?
+            state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
     },
     para: (token, state) => {
+        const blockEntry = createBlockEntry(token, state);
+        state.currentBlock = blockEntry;
+        state.blockStack.push(blockEntry.id);
         processStandaloneSubject(token, state);
         processTokenAnnotations(token, state, token.type);
+        state.blockStack.pop();
+        state.currentBlock = state.blockStack.length > 0 ?
+            state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
     },
     list: (token, state) => {
+        const blockEntry = createBlockEntry(token, state);
+        state.currentBlock = blockEntry;
+        state.blockStack.push(blockEntry.id);
         processTokenAnnotations(token, state, token.type);
+        state.blockStack.pop();
+        state.currentBlock = state.blockStack.length > 0 ?
+            state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
     },
 };
@@ -650,28 +813,37 @@ export function parse(text, options = {}) {
         quads: [],
         quadBuffer: new Map(),
         removeSet: new Set(),
-        origin: { quadIndex: new Map() },
+        origin: {
+            quadIndex: new Map(),
+            blocks: new Map(),
+            documentStructure: []
+        },
         currentSubject: null,
         tokens: null,
         currentTokenIndex: -1,
         statements: [],
-        statementCandidates: new Map() // Track incomplete rdf:Statement patterns
+        statementCandidates: new Map(), // Track incomplete rdf:Statement patterns
+        currentBlock: null,
+        blockStack: []
     };
     state.tokens = scanTokens(text);
-    state.tokens.filter(t => t.type === 'prefix').forEach(t => {
-        let resolvedIri = t.iri;
-        if (t.iri.includes(':')) {
-            const colonIndex = t.iri.indexOf(':');
-            const potentialPrefix = t.iri.substring(0, colonIndex);
-            const reference = t.iri.substring(colonIndex + 1);
-            if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
-                resolvedIri = state.ctx[potentialPrefix] + reference;
+    // Single loop instead of filter+forEach for better performance
+    for (const token of state.tokens) {
+        if (token.type === 'prefix') {
+            let resolvedIri = token.iri;
+            if (token.iri.includes(':')) {
+                const colonIndex = token.iri.indexOf(':');
+                const potentialPrefix = token.iri.substring(0, colonIndex);
+                const reference = token.iri.substring(colonIndex + 1);
+                if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
+                    resolvedIri = state.ctx[potentialPrefix] + reference;
+                }
             }
+            state.ctx[token.prefix] = resolvedIri;
         }
-        state.ctx[t.prefix] = resolvedIri;
-    });
+    }
     for (let i = 0; i < state.tokens.length; i++) {
         const token = state.tokens[i];
@@ -679,18 +851,20 @@ export function parse(text, options = {}) {
         TOKEN_PROCESSORS[token.type]?.(token, state);
     }
-    // Convert removeSet to array and ensure hard invariant: quads ∩ remove = ∅
-    const removeArray = Array.from(state.removeSet);
+    // Optimize array operations - avoid Array.from() and filter()
     const quadKeys = new Set();
-    state.quads.forEach(q => {
-        quadKeys.add(quadIndexKey(q.subject, q.predicate, q.object));
-    });
+    for (const quad of state.quads) {
+        quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
+    }
-    // Filter removeArray to ensure no overlap with quads
-    const filteredRemove = removeArray.filter(quad => {
+    // Direct iteration instead of Array.from() + filter()
+    const filteredRemove = [];
+    for (const quad of state.removeSet) {
         const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
-        return !quadKeys.has(key);
-    });
+        if (!quadKeys.has(key)) {
+            filteredRemove.push(quad);
+        }
+    }
     return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
 }