mdld-parse 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -2
- package/package.json +1 -4
- package/src/merge.js +10 -2
- package/src/parse.js +106 -29
package/README.md
CHANGED
|
@@ -56,6 +56,7 @@ console.log(result.quads);
|
|
|
56
56
|
- **🧩 Fragments** - Document structuring with `{=#fragment}`
|
|
57
57
|
- **⚡ Polarity system** - Sophisticated diff authoring with `+` and `-` prefixes
|
|
58
58
|
- **📍 Origin tracking** - Complete provenance with lean quad-to-source mapping
|
|
59
|
+
- **🎯 Elevated statements** - Automatic rdf:Statement pattern detection for "golden" graph extraction
|
|
59
60
|
|
|
60
61
|
## 🌟 What is MD-LD?
|
|
61
62
|
|
|
@@ -122,7 +123,24 @@ Each predicate form determines the graph edge:
|
|
|
122
123
|
| `?p` | S → O | `[NASA] {=ex:nasa ?org}` | object property |
|
|
123
124
|
| `!p` | O → S | `[Parent] {=ex:p !hasPart}` | reverse object |
|
|
124
125
|
|
|
125
|
-
## 🎨 Syntax Quick Reference
|
|
126
|
+
## 🎯 Elevated Statements
|
|
127
|
+
|
|
128
|
+
MD-LD automatically detects `rdf:Statement` patterns during parsing and extracts elevated SPO quads for convenient consumption by applications.
|
|
129
|
+
|
|
130
|
+
### Pattern Detection
|
|
131
|
+
|
|
132
|
+
When the parser encounters a complete `rdf:Statement` pattern with `rdf:subject`, `rdf:predicate`, and `rdf:object`, it automatically adds the corresponding SPO quad to the `statements` array:
|
|
133
|
+
|
|
134
|
+
```markdown
|
|
135
|
+
[ex] <http://example.org/>
|
|
136
|
+
|
|
137
|
+
## Elevated statement {=ex:stmt1 .rdf:Statement}
|
|
138
|
+
**Alice** {+ex:alice ?rdf:subject} *knows* {+ex:knows ?rdf:predicate} **Bob** {+ex:bob ?rdf:object}
|
|
139
|
+
|
|
140
|
+
Direct statement:**Alice** {=ex:alice} knows **Bob** {?ex:knows +ex:bob}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## 🎨 Syntax Quick Reference
|
|
126
144
|
|
|
127
145
|
### Subject Declaration
|
|
128
146
|
Set current subject (emits no quads):
|
|
@@ -180,10 +198,11 @@ Parse MD-LD markdown and return RDF quads with lean origin tracking.
|
|
|
180
198
|
- `context` (object) — Prefix mappings (default: `{ '@vocab': 'http://www.w3.org/2000/01/rdf-schema#', rdf, rdfs, xsd, sh, prov }`)
|
|
181
199
|
- `dataFactory` (object) — Custom RDF/JS DataFactory
|
|
182
200
|
|
|
183
|
-
**Returns:** `{ quads, remove, origin, context }`
|
|
201
|
+
**Returns:** `{ quads, remove, statements, origin, context }`
|
|
184
202
|
|
|
185
203
|
- `quads` — Array of RDF/JS Quads (final resolved graph state)
|
|
186
204
|
- `remove` — Array of RDF/JS Quads (external retractions targeting prior state)
|
|
205
|
+
- `statements` — Array of elevated RDF/JS Quads extracted from rdf:Statement patterns
|
|
187
206
|
- `origin` — Lean origin tracking object with quadIndex for UI navigation
|
|
188
207
|
- `context` — Final context used (includes prefixes)
|
|
189
208
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdld-parse",
|
|
3
|
-
"version": "0.7.0",
|
|
3
|
+
"version": "0.7.2",
|
|
4
4
|
"description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -38,8 +38,5 @@
|
|
|
38
38
|
"homepage": "https://mdld.js.org",
|
|
39
39
|
"bugs": {
|
|
40
40
|
"url": "https://github.com/davay42/mdld-parse/issues"
|
|
41
|
-
},
|
|
42
|
-
"devDependencies": {
|
|
43
|
-
"n3": "^2.0.1"
|
|
44
41
|
}
|
|
45
42
|
}
|
package/src/merge.js
CHANGED
|
@@ -34,13 +34,14 @@ function normalizeInput(input, options, docContext) {
|
|
|
34
34
|
* Merges multiple MDLD documents with diff polarity resolution
|
|
35
35
|
* @param {Array<string|ParseResult>} docs
|
|
36
36
|
* @param {Object} options
|
|
37
|
-
* @returns {Object}
|
|
37
|
+
* @returns {Object} Merge result with quads, remove, statements, origin, and context
|
|
38
38
|
*/
|
|
39
39
|
export function merge(docs, options = {}) {
|
|
40
40
|
const sessionBuffer = new Map(); // Use Map instead of Set for proper quad storage
|
|
41
41
|
const sessionRemoveSet = new Set();
|
|
42
42
|
const allDocuments = [];
|
|
43
43
|
const quadIndex = new Map();
|
|
44
|
+
const allStatements = []; // Collect statements from all documents
|
|
44
45
|
|
|
45
46
|
// Process each document in order
|
|
46
47
|
for (let i = 0; i < docs.length; i++) {
|
|
@@ -57,10 +58,16 @@ export function merge(docs, options = {}) {
|
|
|
57
58
|
index: i,
|
|
58
59
|
input: typeof input === 'string' ? 'string' : 'ParseResult',
|
|
59
60
|
origin: doc.origin,
|
|
60
|
-
context: doc.context
|
|
61
|
+
context: doc.context,
|
|
62
|
+
statementsCount: doc.statements?.length || 0 // Track statements count
|
|
61
63
|
};
|
|
62
64
|
allDocuments.push(documentOrigin);
|
|
63
65
|
|
|
66
|
+
// Collect statements from this document
|
|
67
|
+
if (doc.statements && doc.statements.length > 0) {
|
|
68
|
+
allStatements.push(...doc.statements);
|
|
69
|
+
}
|
|
70
|
+
|
|
64
71
|
// Fold assertions into session buffer
|
|
65
72
|
for (const quad of doc.quads) {
|
|
66
73
|
const key = quadKey(quad);
|
|
@@ -125,6 +132,7 @@ export function merge(docs, options = {}) {
|
|
|
125
132
|
return {
|
|
126
133
|
quads: filteredQuads,
|
|
127
134
|
remove: filteredRemove,
|
|
135
|
+
statements: allStatements, // Include all collected statements
|
|
128
136
|
origin: mergeOrigin,
|
|
129
137
|
context: finalContext
|
|
130
138
|
};
|
package/src/parse.js
CHANGED
|
@@ -210,6 +210,12 @@ function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, ex
|
|
|
210
210
|
return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
|
|
211
211
|
}
|
|
212
212
|
|
|
213
|
+
// Pre-compiled carrier patterns for better performance
|
|
214
|
+
const CARRIER_PATTERN_ARRAY = [
|
|
215
|
+
['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
|
|
216
|
+
['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
|
|
217
|
+
];
|
|
218
|
+
|
|
213
219
|
function extractInlineCarriers(text, baseOffset = 0) {
|
|
214
220
|
const carriers = [];
|
|
215
221
|
let pos = 0;
|
|
@@ -243,7 +249,8 @@ function extractInlineCarriers(text, baseOffset = 0) {
|
|
|
243
249
|
const extractor = CARRIER_EXTRACTORS[text[pos]];
|
|
244
250
|
if (extractor) return extractor(text, pos, baseOffset);
|
|
245
251
|
|
|
246
|
-
|
|
252
|
+
// Use pre-compiled patterns instead of Object.entries()
|
|
253
|
+
for (const [type, pattern] of CARRIER_PATTERN_ARRAY) {
|
|
247
254
|
pattern.lastIndex = pos;
|
|
248
255
|
const match = pattern.exec(text);
|
|
249
256
|
if (match) {
|
|
@@ -364,7 +371,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
|
|
|
364
371
|
};
|
|
365
372
|
}
|
|
366
373
|
|
|
367
|
-
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null) {
|
|
374
|
+
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
|
|
368
375
|
if (!subject || !predicate || !object) return;
|
|
369
376
|
|
|
370
377
|
const quad = dataFactory.quad(subject, predicate, object);
|
|
@@ -397,14 +404,17 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
|
|
|
397
404
|
quadBuffer.set(quadKey, quad);
|
|
398
405
|
quads.push(quad);
|
|
399
406
|
|
|
400
|
-
//
|
|
407
|
+
// Detect rdf:Statement pattern during single-pass parsing
|
|
408
|
+
detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
|
|
409
|
+
|
|
410
|
+
// Create lean origin entry - avoid spread operator for better performance
|
|
401
411
|
const originEntry = {
|
|
402
412
|
blockId: block.id,
|
|
403
413
|
range: block.range,
|
|
404
414
|
carrierType: block.carrierType,
|
|
405
415
|
subject: subject.value,
|
|
406
416
|
predicate: predicate.value,
|
|
407
|
-
context:
|
|
417
|
+
context: block.context, // Direct reference instead of spread
|
|
408
418
|
polarity: meta?.remove ? '-' : '+',
|
|
409
419
|
value: block.text || ''
|
|
410
420
|
};
|
|
@@ -413,10 +423,68 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
|
|
|
413
423
|
}
|
|
414
424
|
}
|
|
415
425
|
|
|
426
|
+
// Extract RDF constants once at module level for efficiency
|
|
427
|
+
const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
|
|
428
|
+
const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
|
|
429
|
+
const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
|
|
430
|
+
const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
|
|
431
|
+
const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
|
|
432
|
+
|
|
433
|
+
function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
|
|
434
|
+
// Skip if not called from parse context (for testing compatibility)
|
|
435
|
+
if (!statements || !statementCandidates) return;
|
|
436
|
+
|
|
437
|
+
const predicate = quad.predicate.value;
|
|
438
|
+
|
|
439
|
+
// Early filter: only process rdf:Statement related predicates
|
|
440
|
+
if (predicate !== RDF_TYPE &&
|
|
441
|
+
predicate !== RDF_SUBJECT &&
|
|
442
|
+
predicate !== RDF_PREDICATE &&
|
|
443
|
+
predicate !== RDF_OBJECT) {
|
|
444
|
+
return;
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// Check if this quad starts a new rdf:Statement pattern
|
|
448
|
+
if (predicate === RDF_TYPE && quad.object.value === RDF_STATEMENT) {
|
|
449
|
+
statementCandidates.set(quad.subject.value, { spo: {} });
|
|
450
|
+
return;
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// Check if this quad completes part of an existing rdf:Statement pattern
|
|
454
|
+
const candidate = statementCandidates.get(quad.subject.value);
|
|
455
|
+
if (!candidate) return;
|
|
456
|
+
|
|
457
|
+
// Direct property assignment instead of switch for better performance
|
|
458
|
+
if (predicate === RDF_SUBJECT) {
|
|
459
|
+
candidate.spo.subject = quad.object;
|
|
460
|
+
} else if (predicate === RDF_PREDICATE) {
|
|
461
|
+
candidate.spo.predicate = quad.object;
|
|
462
|
+
} else if (predicate === RDF_OBJECT) {
|
|
463
|
+
candidate.spo.object = quad.object;
|
|
464
|
+
// Store the original quad for potential literal extraction
|
|
465
|
+
candidate.objectQuad = quad;
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
// Check if pattern is complete and create elevated SPO quad
|
|
469
|
+
if (candidate.spo.subject && candidate.spo.predicate && candidate.spo.object) {
|
|
470
|
+
// Use the object directly - literal detection happens at parse time
|
|
471
|
+
const spoQuad = dataFactory.quad(
|
|
472
|
+
candidate.spo.subject,
|
|
473
|
+
candidate.spo.predicate,
|
|
474
|
+
candidate.spo.object
|
|
475
|
+
);
|
|
476
|
+
statements.push(spoQuad);
|
|
477
|
+
// Clean up candidate to avoid duplicate detection
|
|
478
|
+
statementCandidates.delete(quad.subject.value);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
|
|
416
482
|
const resolveFragment = (fragment, state) => {
|
|
417
483
|
if (!state.currentSubject) return null;
|
|
418
|
-
const
|
|
419
|
-
|
|
484
|
+
const subjectValue = state.currentSubject.value;
|
|
485
|
+
const hashIndex = subjectValue.indexOf('#');
|
|
486
|
+
const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
|
|
487
|
+
return state.df.namedNode(baseIRI + '#' + fragment);
|
|
420
488
|
};
|
|
421
489
|
|
|
422
490
|
function resolveSubject(sem, state) {
|
|
@@ -444,7 +512,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
|
|
|
444
512
|
state.df.namedNode(expandIRI('rdf:type', state.ctx)),
|
|
445
513
|
state.df.namedNode(expandedType),
|
|
446
514
|
state.df,
|
|
447
|
-
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove }
|
|
515
|
+
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
|
|
516
|
+
state.statements, state.statementCandidates
|
|
448
517
|
);
|
|
449
518
|
};
|
|
450
519
|
|
|
@@ -487,7 +556,8 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
|
|
|
487
556
|
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
488
557
|
emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
|
|
489
558
|
role.subject, P, role.object, state.df,
|
|
490
|
-
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false }
|
|
559
|
+
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
|
|
560
|
+
state.statements, state.statementCandidates
|
|
491
561
|
);
|
|
492
562
|
}
|
|
493
563
|
});
|
|
@@ -592,23 +662,28 @@ export function parse(text, options = {}) {
|
|
|
592
662
|
origin: { quadIndex: new Map() },
|
|
593
663
|
currentSubject: null,
|
|
594
664
|
tokens: null,
|
|
595
|
-
currentTokenIndex: -1
|
|
665
|
+
currentTokenIndex: -1,
|
|
666
|
+
statements: [],
|
|
667
|
+
statementCandidates: new Map() // Track incomplete rdf:Statement patterns
|
|
596
668
|
};
|
|
597
669
|
|
|
598
670
|
state.tokens = scanTokens(text);
|
|
599
671
|
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
if (
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
672
|
+
// Single loop instead of filter+forEach for better performance
|
|
673
|
+
for (const token of state.tokens) {
|
|
674
|
+
if (token.type === 'prefix') {
|
|
675
|
+
let resolvedIri = token.iri;
|
|
676
|
+
if (token.iri.includes(':')) {
|
|
677
|
+
const colonIndex = token.iri.indexOf(':');
|
|
678
|
+
const potentialPrefix = token.iri.substring(0, colonIndex);
|
|
679
|
+
const reference = token.iri.substring(colonIndex + 1);
|
|
680
|
+
if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
|
|
681
|
+
resolvedIri = state.ctx[potentialPrefix] + reference;
|
|
682
|
+
}
|
|
608
683
|
}
|
|
684
|
+
state.ctx[token.prefix] = resolvedIri;
|
|
609
685
|
}
|
|
610
|
-
|
|
611
|
-
});
|
|
686
|
+
}
|
|
612
687
|
|
|
613
688
|
for (let i = 0; i < state.tokens.length; i++) {
|
|
614
689
|
const token = state.tokens[i];
|
|
@@ -616,18 +691,20 @@ export function parse(text, options = {}) {
|
|
|
616
691
|
TOKEN_PROCESSORS[token.type]?.(token, state);
|
|
617
692
|
}
|
|
618
693
|
|
|
619
|
-
//
|
|
620
|
-
const removeArray = Array.from(state.removeSet);
|
|
694
|
+
// Optimize array operations - avoid Array.from() and filter()
|
|
621
695
|
const quadKeys = new Set();
|
|
622
|
-
state.quads
|
|
623
|
-
quadKeys.add(quadIndexKey(
|
|
624
|
-
}
|
|
696
|
+
for (const quad of state.quads) {
|
|
697
|
+
quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
|
|
698
|
+
}
|
|
625
699
|
|
|
626
|
-
//
|
|
627
|
-
const filteredRemove =
|
|
700
|
+
// Direct iteration instead of Array.from() + filter()
|
|
701
|
+
const filteredRemove = [];
|
|
702
|
+
for (const quad of state.removeSet) {
|
|
628
703
|
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
629
|
-
|
|
630
|
-
|
|
704
|
+
if (!quadKeys.has(key)) {
|
|
705
|
+
filteredRemove.push(quad);
|
|
706
|
+
}
|
|
707
|
+
}
|
|
631
708
|
|
|
632
|
-
return { quads: state.quads, remove: filteredRemove, origin: state.origin, context: state.ctx };
|
|
709
|
+
return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
|
|
633
710
|
}
|