mdld-parse 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -1
- package/src/generate.js +2 -1
- package/src/index.js +1 -1
- package/src/merge.js +1 -1
- package/src/parse.js +208 -48
- package/src/render.js +345 -345
- package/src/shared.js +212 -0
- package/src/utils.js +2 -9
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdld-parse",
|
|
3
|
-
"version": "0.7.2",
|
|
3
|
+
"version": "0.7.3",
|
|
4
4
|
"description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -38,5 +38,8 @@
|
|
|
38
38
|
"homepage": "https://mdld.js.org",
|
|
39
39
|
"bugs": {
|
|
40
40
|
"url": "https://github.com/davay42/mdld-parse/issues"
|
|
41
|
+
},
|
|
42
|
+
"dependencies": {
|
|
43
|
+
"rdfa-parse": "^1.0.1"
|
|
41
44
|
}
|
|
42
45
|
}
|
package/src/generate.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { shortenIRI, expandIRI,
|
|
1
|
+
import { shortenIRI, expandIRI, DataFactory } from './utils.js';
|
|
2
|
+
import { DEFAULT_CONTEXT } from './shared.js';
|
|
2
3
|
|
|
3
4
|
// Helper functions for cleaner term type checking
|
|
4
5
|
function isLiteral(term) {
|
package/src/index.js
CHANGED
|
@@ -3,8 +3,8 @@ export { merge } from './merge.js';
|
|
|
3
3
|
export { generate } from './generate.js';
|
|
4
4
|
export { locate } from './locate.js';
|
|
5
5
|
export { render } from './render.js';
|
|
6
|
+
export { DEFAULT_CONTEXT } from './shared.js';
|
|
6
7
|
export {
|
|
7
|
-
DEFAULT_CONTEXT,
|
|
8
8
|
DataFactory,
|
|
9
9
|
hash,
|
|
10
10
|
expandIRI,
|
package/src/merge.js
CHANGED
package/src/parse.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import {
|
|
2
|
-
DEFAULT_CONTEXT,
|
|
3
2
|
DataFactory,
|
|
4
3
|
expandIRI,
|
|
5
4
|
parseSemanticBlock,
|
|
@@ -7,18 +6,7 @@ import {
|
|
|
7
6
|
createLiteral,
|
|
8
7
|
hash
|
|
9
8
|
} from './utils.js';
|
|
10
|
-
|
|
11
|
-
const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
|
|
12
|
-
const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
|
|
13
|
-
const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
|
|
14
|
-
const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
|
|
15
|
-
const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
|
|
16
|
-
const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
|
|
17
|
-
const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
|
|
18
|
-
const INLINE_CARRIER_PATTERNS = {
|
|
19
|
-
EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
|
|
20
|
-
CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
|
|
21
|
-
};
|
|
9
|
+
import { DEFAULT_CONTEXT, URL_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX, STANDALONE_SUBJECT_REGEX, INLINE_CARRIER_PATTERNS } from './shared.js';
|
|
22
10
|
|
|
23
11
|
// Cache for fence regex patterns to avoid recreation
|
|
24
12
|
const FENCE_CLOSE_PATTERNS = new Map();
|
|
@@ -349,6 +337,150 @@ function determineCarrierType(url) {
|
|
|
349
337
|
return { carrierType: 'span', resourceIRI: null };
|
|
350
338
|
}
|
|
351
339
|
|
|
340
|
+
/**
 * Build the structural record for a block-level token and register it on
 * the parse state.
 *
 * The record is stored in `state.origin.blocks` (keyed by block id) and is
 * appended to `state.origin.documentStructure`. The id is also written back
 * onto the token as `_blockId` so that a later call for the same token
 * reuses it instead of hashing again.
 *
 * @param {object} token - Scanned token; reads `type`, `range`, `text`,
 *   `attrsRange` (via extractCleanText), `indent` and `_blockId`.
 * @param {object} state - Parse state; reads `blockStack`, writes to
 *   `origin.blocks` and `origin.documentStructure`.
 * @returns {object} The newly created block entry.
 */
function createBlockEntry(token, state) {
  // Prefer an id that was already assigned to this token; otherwise derive
  // one from the token type and its source range.
  let id = token._blockId;
  if (!id) {
    id = hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
  }
  token._blockId = id;

  // The innermost open block (top of the stack), if any, is the parent.
  const openBlocks = state.blockStack;
  const parentBlockId = openBlocks.length > 0 ? openBlocks[openBlocks.length - 1] : null;

  const entry = {
    id,
    type: token.type,
    range: token.range,
    text: extractCleanText(token),
    subject: null,
    types: [],
    predicates: [],
    carriers: [],
    listLevel: token.indent || 0,
    parentBlockId,
    quadKeys: [], // Filled in while quads are emitted
  };

  state.origin.blocks.set(id, entry);
  state.origin.documentStructure.push(entry);

  return entry;
}
|
|
366
|
+
|
|
367
|
+
/**
 * Return the human-readable text of a token with its semantic annotation
 * and its Markdown block marker removed.
 *
 * @param {object} token - Token with `text`, `type` and optionally `range`
 *   and `attrsRange`. `attrsRange` offsets are rebased by `range[0]`
 *   (or 0 when `range` is absent) before slicing `text`.
 * @returns {string} Trimmed text; an empty string when the token has no text.
 */
function extractCleanText(token) {
  if (!token.text) return '';

  let text = token.text;

  // Cut the `{...}` annotation out of the text, keeping what is on either side.
  if (token.attrsRange) {
    const base = token.range?.[0] || 0;
    const beforeAttrs = text.substring(0, token.attrsRange[0] - base);
    const afterAttrs = text.substring(token.attrsRange[1] - base);
    text = beforeAttrs + afterAttrs;
  }

  // Strip the block marker that belongs to the token type.
  switch (token.type) {
    case 'heading':
      return text.replace(/^#+\s*/, '').trim();
    case 'list':
      // List items may be indented and may use ordered markers ("1."), so
      // accept leading whitespace and both bullet and numeric markers.
      // A numeric marker must be followed by whitespace so that content such
      // as "1.5 kg" is not mistaken for a marker.
      return text.replace(/^\s*(?:[-*+]\s*|\d+\.\s+)/, '').trim();
    case 'blockquote':
      return text.replace(/^>\s*/, '').trim();
    default:
      return text.trim();
  }
}
|
|
391
|
+
|
|
392
|
+
/**
 * Copy the semantic information of one annotation onto a block entry.
 *
 * Mutates `blockEntry` in place:
 *  - `subject` is set to the resolved subject's value (unless the annotation
 *    is a subject RESET or nothing resolves);
 *  - expanded type IRIs are appended to `types` without duplicates;
 *  - one record per predicate is appended to `predicates`;
 *  - a description of `carrier`, when given, is appended to `carriers`.
 *
 * @param {object} blockEntry - Entry with `subject`, `types`, `predicates`
 *   and `carriers` fields.
 * @param {object} sem - Parsed annotation; reads `subject`, `types`
 *   (strings or objects with `iri`) and `predicates` (objects with `iri`
 *   and optional `form`).
 * @param {object|null} carrier - Carrier of the annotation; reads `type`,
 *   `range`, `text` and `attrs`.
 * @param {object} state - Parse state; `state.ctx` is passed to expandIRI.
 * @returns {void}
 */
function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
  // Update subject if available
  if (sem.subject && sem.subject !== 'RESET') {
    const resolvedSubject = resolveSubject(sem, state);
    if (resolvedSubject) {
      blockEntry.subject = resolvedSubject.value;
    }
  }

  // Add types (deduplicated on the expanded IRI)
  for (const t of sem.types ?? []) {
    const typeIRI = typeof t === 'string' ? t : t.iri;
    const expanded = expandIRI(typeIRI, state.ctx);
    if (!blockEntry.types.includes(expanded)) {
      blockEntry.types.push(expanded);
    }
  }

  // Add predicates
  for (const pred of sem.predicates ?? []) {
    blockEntry.predicates.push({
      iri: expandIRI(pred.iri, state.ctx),
      form: pred.form || '',
      object: null // Will be filled during quad emission
    });
  }

  // Add carrier information
  if (!carrier) return;

  const carrierSem = carrier.attrs ? parseSemCached(carrier.attrs) : null;

  blockEntry.carriers.push({
    type: carrier.type,
    range: carrier.range,
    text: carrier.text,
    subject: null,
    // Record the carrier's own predicates whenever it has any. The guard is
    // on `predicates` itself (not on `types`), and the array is copied so the
    // block entry never aliases the object handed back by parseSemCached.
    predicates: carrierSem?.predicates ? [...carrierSem.predicates] : []
  });
}
|
|
445
|
+
|
|
446
|
+
/**
 * Process one semantic annotation: update the current subject, record the
 * annotation on the block being tracked, and emit type/predicate quads.
 *
 * Subject handling:
 *  - `sem.subject === 'RESET'` clears `state.currentSubject` and stops;
 *  - a newly resolved subject replaces `state.currentSubject` unless
 *    `preserveGlobalSubject` is set or an `implicitSubject` is supplied;
 *  - with `preserveGlobalSubject`, the new subject (or else the previous
 *    one) is used for this annotation only; otherwise `implicitSubject`
 *    takes precedence over `state.currentSubject`.
 * Nothing is emitted when no subject can be determined.
 *
 * @param {object} carrier - Annotation carrier; reads `range`, `attrsRange`,
 *   `valueRange`, `type`, `text` and `url`.
 * @param {object} sem - Parsed annotation (subject, types, predicates,
 *   datatype, language).
 * @param {object} state - Parse state (`currentSubject`, `currentBlock`,
 *   `ctx`, `df`, ...).
 * @param {object} [options]
 * @param {boolean} [options.preserveGlobalSubject=false]
 * @param {object|null} [options.implicitSubject=null]
 * @returns {void}
 */
function processAnnotationWithBlockTracking(carrier, sem, state, options = {}) {
  const { preserveGlobalSubject = false, implicitSubject = null } = options;

  if (sem.subject === 'RESET') {
    state.currentSubject = null;
    return;
  }

  const previousSubject = state.currentSubject;
  const newSubject = resolveSubject(sem, state);
  const localObject = resolveObject(sem, state);

  if (newSubject && !preserveGlobalSubject && !implicitSubject) {
    state.currentSubject = newSubject;
  }
  const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
  if (!S) return;

  const block = createBlock(
    S.value, sem.types, sem.predicates,
    carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
    carrier.type || null, state.ctx, carrier.text
  );

  const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
  const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
  const newSubjectOrCarrierO = newSubject || carrierO;

  // Enrich current block with semantic information
  if (state.currentBlock) {
    enrichBlockFromAnnotation(state.currentBlock, sem, carrier, state);
  }

  processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
  processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
}
|
|
483
|
+
|
|
352
484
|
function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
|
|
353
485
|
const expanded = {
|
|
354
486
|
subject,
|
|
@@ -371,7 +503,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
|
|
|
371
503
|
};
|
|
372
504
|
}
|
|
373
505
|
|
|
374
|
-
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
|
|
506
|
+
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null, state = null) {
|
|
375
507
|
if (!subject || !predicate || !object) return;
|
|
376
508
|
|
|
377
509
|
const quad = dataFactory.quad(subject, predicate, object);
|
|
@@ -420,6 +552,14 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
|
|
|
420
552
|
};
|
|
421
553
|
|
|
422
554
|
quadIndex.set(quadKey, originEntry);
|
|
555
|
+
|
|
556
|
+
// Link block to this quad for reverse lookup during rendering
|
|
557
|
+
if (state.currentBlock && block.id === state.currentBlock.id) {
|
|
558
|
+
if (!state.currentBlock.quadKeys) {
|
|
559
|
+
state.currentBlock.quadKeys = [];
|
|
560
|
+
}
|
|
561
|
+
state.currentBlock.quadKeys.push(quadKey);
|
|
562
|
+
}
|
|
423
563
|
}
|
|
424
564
|
}
|
|
425
565
|
|
|
@@ -513,7 +653,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
|
|
|
513
653
|
state.df.namedNode(expandedType),
|
|
514
654
|
state.df,
|
|
515
655
|
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
|
|
516
|
-
state.statements, state.statementCandidates
|
|
656
|
+
state.statements, state.statementCandidates,
|
|
657
|
+
state
|
|
517
658
|
);
|
|
518
659
|
};
|
|
519
660
|
|
|
@@ -557,43 +698,16 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
|
|
|
557
698
|
emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
|
|
558
699
|
role.subject, P, role.object, state.df,
|
|
559
700
|
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
|
|
560
|
-
state.statements, state.statementCandidates
|
|
701
|
+
state.statements, state.statementCandidates,
|
|
702
|
+
state
|
|
561
703
|
);
|
|
562
704
|
}
|
|
563
705
|
});
|
|
564
706
|
}
|
|
565
707
|
|
|
566
708
|
function processAnnotation(carrier, sem, state, options = {}) {
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
if (sem.subject === 'RESET') {
|
|
570
|
-
state.currentSubject = null;
|
|
571
|
-
return;
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
const previousSubject = state.currentSubject;
|
|
575
|
-
const newSubject = resolveSubject(sem, state);
|
|
576
|
-
const localObject = resolveObject(sem, state);
|
|
577
|
-
|
|
578
|
-
const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
|
|
579
|
-
if (newSubject && !preserveGlobalSubject && !implicitSubject) {
|
|
580
|
-
state.currentSubject = newSubject;
|
|
581
|
-
}
|
|
582
|
-
const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
|
|
583
|
-
if (!S) return;
|
|
584
|
-
|
|
585
|
-
const block = createBlock(
|
|
586
|
-
S.value, sem.types, sem.predicates,
|
|
587
|
-
carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
|
|
588
|
-
carrier.type || null, state.ctx, carrier.text
|
|
589
|
-
);
|
|
590
|
-
|
|
591
|
-
const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
|
|
592
|
-
const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
|
|
593
|
-
const newSubjectOrCarrierO = newSubject || carrierO;
|
|
594
|
-
|
|
595
|
-
processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
|
|
596
|
-
processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
|
|
709
|
+
// Use the enhanced block tracking version
|
|
710
|
+
processAnnotationWithBlockTracking(carrier, sem, state, options);
|
|
597
711
|
}
|
|
598
712
|
|
|
599
713
|
|
|
@@ -635,20 +749,60 @@ function processStandaloneSubject(token, state) {
|
|
|
635
749
|
|
|
636
750
|
/**
 * Run `processToken` for `token` while its block entry is the current block.
 *
 * A block entry is created for the token, made `state.currentBlock`, and its
 * id is pushed on `state.blockStack` for the duration of the callback.
 * Afterwards the id is popped and `state.currentBlock` is restored to the
 * entry for the new top of the stack (or `null` when the stack is empty).
 *
 * @param {object} token - Token being processed.
 * @param {object} state - Parse state.
 * @param {(token: object, state: object) => void} processToken - Work to
 *   perform inside the block scope.
 * @returns {void}
 */
function runWithBlockTracking(token, state, processToken) {
  const blockEntry = createBlockEntry(token, state);
  state.currentBlock = blockEntry;
  state.blockStack.push(blockEntry.id);

  processToken(token, state);

  state.blockStack.pop();
  state.currentBlock = state.blockStack.length > 0 ?
    state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
}

// Per-token-type handlers. Every handler tracks a block entry around the
// annotation processing; `para` additionally handles a standalone subject
// declaration before its annotations.
const TOKEN_PROCESSORS = {
  heading: (token, state) => {
    runWithBlockTracking(token, state, (t, s) => processTokenAnnotations(t, s, t.type));
  },
  code: (token, state) => {
    runWithBlockTracking(token, state, (t, s) => processTokenAnnotations(t, s, t.type));
  },
  blockquote: (token, state) => {
    runWithBlockTracking(token, state, (t, s) => processTokenAnnotations(t, s, t.type));
  },
  para: (token, state) => {
    runWithBlockTracking(token, state, (t, s) => {
      processStandaloneSubject(t, s);
      processTokenAnnotations(t, s, t.type);
    });
  },
  list: (token, state) => {
    runWithBlockTracking(token, state, (t, s) => processTokenAnnotations(t, s, t.type));
  },
};
|
|
654
808
|
|
|
@@ -659,12 +813,18 @@ export function parse(text, options = {}) {
|
|
|
659
813
|
quads: [],
|
|
660
814
|
quadBuffer: new Map(),
|
|
661
815
|
removeSet: new Set(),
|
|
662
|
-
origin: {
|
|
816
|
+
origin: {
|
|
817
|
+
quadIndex: new Map(),
|
|
818
|
+
blocks: new Map(),
|
|
819
|
+
documentStructure: []
|
|
820
|
+
},
|
|
663
821
|
currentSubject: null,
|
|
664
822
|
tokens: null,
|
|
665
823
|
currentTokenIndex: -1,
|
|
666
824
|
statements: [],
|
|
667
|
-
statementCandidates: new Map() // Track incomplete rdf:Statement patterns
|
|
825
|
+
statementCandidates: new Map(), // Track incomplete rdf:Statement patterns
|
|
826
|
+
currentBlock: null,
|
|
827
|
+
blockStack: []
|
|
668
828
|
};
|
|
669
829
|
|
|
670
830
|
state.tokens = scanTokens(text);
|