mdld-parse 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mdld-parse",
3
- "version": "0.7.2",
3
+ "version": "0.7.3",
4
4
  "description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -38,5 +38,8 @@
38
38
  "homepage": "https://mdld.js.org",
39
39
  "bugs": {
40
40
  "url": "https://github.com/davay42/mdld-parse/issues"
41
+ },
42
+ "dependencies": {
43
+ "rdfa-parse": "^1.0.1"
41
44
  }
42
45
  }
package/src/generate.js CHANGED
@@ -1,4 +1,5 @@
1
- import { shortenIRI, expandIRI, DEFAULT_CONTEXT, DataFactory } from './utils.js';
1
+ import { shortenIRI, expandIRI, DataFactory } from './utils.js';
2
+ import { DEFAULT_CONTEXT } from './shared.js';
2
3
 
3
4
  // Helper functions for cleaner term type checking
4
5
  function isLiteral(term) {
package/src/index.js CHANGED
@@ -3,8 +3,8 @@ export { merge } from './merge.js';
3
3
  export { generate } from './generate.js';
4
4
  export { locate } from './locate.js';
5
5
  export { render } from './render.js';
6
+ export { DEFAULT_CONTEXT } from './shared.js';
6
7
  export {
7
- DEFAULT_CONTEXT,
8
8
  DataFactory,
9
9
  hash,
10
10
  expandIRI,
package/src/merge.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { parse } from './parse.js';
2
- import { DEFAULT_CONTEXT } from './utils.js';
2
+ import { DEFAULT_CONTEXT } from './shared.js';
3
3
 
4
4
  /**
5
5
  * Creates a unique key for quad identity matching
package/src/parse.js CHANGED
@@ -1,5 +1,4 @@
1
1
  import {
2
- DEFAULT_CONTEXT,
3
2
  DataFactory,
4
3
  expandIRI,
5
4
  parseSemanticBlock,
@@ -7,18 +6,7 @@ import {
7
6
  createLiteral,
8
7
  hash
9
8
  } from './utils.js';
10
-
11
- const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
12
- const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
13
- const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
14
- const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
15
- const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
16
- const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
17
- const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
18
- const INLINE_CARRIER_PATTERNS = {
19
- EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
20
- CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
21
- };
9
+ import { DEFAULT_CONTEXT, URL_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX, STANDALONE_SUBJECT_REGEX, INLINE_CARRIER_PATTERNS } from './shared.js';
22
10
 
23
11
  // Cache for fence regex patterns to avoid recreation
24
12
  const FENCE_CLOSE_PATTERNS = new Map();
@@ -349,6 +337,150 @@ function determineCarrierType(url) {
349
337
  return { carrierType: 'span', resourceIRI: null };
350
338
  }
351
339
 
340
/**
 * Builds the tracking record for a block-level token and registers it on the
 * parser state.
 *
 * The block id is taken from `token._blockId` when one is already present;
 * otherwise it is derived by hashing the token type together with the bounds
 * of `token.range`. Either way the id is written back onto the token so later
 * passes can find the same entry.
 *
 * @param {object} token - Block token; reads `type`, `range`, `indent`, `_blockId`.
 * @param {object} state - Parser state; reads `blockStack`, writes to
 *   `origin.blocks` and `origin.documentStructure`.
 * @returns {object} The newly registered block entry.
 */
function createBlockEntry(token, state) {
  if (!token._blockId) {
    token._blockId = hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
  }
  const id = token._blockId;

  const text = extractCleanText(token);

  // The innermost open block (top of the stack), if any, is this block's parent.
  const { blockStack } = state;
  const depth = blockStack.length;
  const parentBlockId = depth > 0 ? blockStack[depth - 1] : null;

  const entry = {
    id,
    type: token.type,
    range: token.range,
    text,
    subject: null,
    types: [],
    predicates: [],
    carriers: [],
    listLevel: token.indent || 0,
    parentBlockId,
    quadKeys: [] // Will be populated during quad emission
  };

  // Index by id and keep document order.
  state.origin.blocks.set(id, entry);
  state.origin.documentStructure.push(entry);

  return entry;
}
366
+
367
/**
 * Returns the human-readable text of a block token with its semantic
 * annotation and leading Markdown marker removed.
 *
 * @param {object} token - Block token; reads `text`, `type`, `range`, `attrsRange`.
 *   `attrsRange` is treated as using the same offset base as `range`
 *   (it is rebased by subtracting `range[0]`).
 * @returns {string} Trimmed text, or '' when the token has no text.
 */
function extractCleanText(token) {
  if (!token.text) return '';

  let text = token.text;

  // Cut the `{...}` annotation out of the text.
  if (token.attrsRange) {
    const base = token.range?.[0] ?? 0;
    const beforeAttrs = text.substring(0, token.attrsRange[0] - base);
    const afterAttrs = text.substring(token.attrsRange[1] - base);
    text = beforeAttrs + afterAttrs;
  }

  // Strip the block marker that belongs to the token type.
  switch (token.type) {
    case 'heading':
      return text.replace(/^#+\s*/, '').trim();
    case 'list':
      // List items may be indented and may be ordered ("1."), so accept
      // leading whitespace and numeric markers as well as -, * and +.
      // A numeric marker must be followed by whitespace so that text such as
      // "3.14" is left untouched.
      return text.replace(/^\s*(?:[-*+]\s*|\d+\.\s+)/, '').trim();
    case 'blockquote':
      return text.replace(/^>\s*/, '').trim();
    default:
      return text.trim();
  }
}
391
+
392
/**
 * Copies the semantic content of one annotation onto a tracked block entry.
 *
 * Mutates `blockEntry` in place:
 *  - `subject` is set to the resolved subject's value when the annotation
 *    declares a subject other than 'RESET' and it resolves;
 *  - `types` receives each expanded type IRI not already present;
 *  - `predicates` receives one `{ iri, form, object: null }` record per
 *    annotation predicate;
 *  - `carriers` receives a summary of `carrier`, when one is given.
 *
 * @param {object} blockEntry - Entry with `subject`, `types`, `predicates`, `carriers`.
 * @param {object} sem - Parsed annotation; reads `subject`, `types`, `predicates`.
 * @param {object|null} carrier - Carrier; reads `type`, `range`, `text`, `attrs`.
 * @param {object} state - Parser state; `ctx` is used for IRI expansion.
 */
function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
  // Update subject if available
  if (sem.subject && sem.subject !== 'RESET') {
    const resolvedSubject = resolveSubject(sem, state);
    if (resolvedSubject) {
      blockEntry.subject = resolvedSubject.value;
    }
  }

  // Add types; entries may be plain strings or objects carrying an `iri`.
  for (const t of sem.types || []) {
    const typeIRI = typeof t === 'string' ? t : t.iri;
    const expanded = expandIRI(typeIRI, state.ctx);
    if (!blockEntry.types.includes(expanded)) {
      blockEntry.types.push(expanded);
    }
  }

  // Add predicates
  for (const pred of sem.predicates || []) {
    blockEntry.predicates.push({
      iri: expandIRI(pred.iri, state.ctx),
      form: pred.form || '',
      object: null // Will be filled during quad emission
    });
  }

  if (!carrier) return;

  // Add carrier information
  const carrierInfo = {
    type: carrier.type,
    range: carrier.range,
    text: carrier.text,
    subject: null,
    predicates: []
  };

  // Extract carrier-specific semantics. The predicates are copied whenever
  // the carrier has attributes; they must not depend on whether the carrier
  // also declares types.
  // NOTE(review): assumes parseSemCached returns an object whose `predicates`
  // is an array or undefined - confirm against its definition.
  if (carrier.attrs) {
    const carrierSem = parseSemCached(carrier.attrs);
    carrierInfo.predicates = carrierSem.predicates || [];
  }

  blockEntry.carriers.push(carrierInfo);
}
445
+
446
/**
 * Processes one semantic annotation on a carrier: updates the current
 * subject, records the annotation on the block being tracked, and hands the
 * type and predicate annotations to their processors.
 *
 * Subject selection:
 *  - `sem.subject === 'RESET'` clears `state.currentSubject` and stops.
 *  - A newly resolved subject replaces `state.currentSubject` unless
 *    `preserveGlobalSubject` is set or an `implicitSubject` is supplied.
 *  - The subject used for emission is `newSubject || previousSubject` when
 *    `preserveGlobalSubject` is set, otherwise
 *    `implicitSubject || state.currentSubject`. Nothing happens without one.
 *
 * @param {object} carrier - Carrier; reads `range`, `attrsRange`, `valueRange`,
 *   `type`, `text`, `url`.
 * @param {object} sem - Parsed annotation.
 * @param {object} state - Parser state; reads `ctx`, `df`, `currentBlock`,
 *   reads and writes `currentSubject`.
 * @param {object} [options]
 * @param {boolean} [options.preserveGlobalSubject=false]
 * @param {object|null} [options.implicitSubject=null]
 * @returns {void}
 */
function processAnnotationWithBlockTracking(carrier, sem, state, options = {}) {
  const { preserveGlobalSubject = false, implicitSubject = null } = options;

  if (sem.subject === 'RESET') {
    state.currentSubject = null;
    return;
  }

  const previousSubject = state.currentSubject;
  const newSubject = resolveSubject(sem, state);
  const localObject = resolveObject(sem, state);

  if (newSubject && !preserveGlobalSubject && !implicitSubject) {
    state.currentSubject = newSubject;
  }
  const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
  if (!S) return;

  const block = createBlock(
    S.value, sem.types, sem.predicates,
    carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
    carrier.type || null, state.ctx, carrier.text
  );

  const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
  const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
  const newSubjectOrCarrierO = newSubject || carrierO;

  // Enrich current block with semantic information
  if (state.currentBlock) {
    enrichBlockFromAnnotation(state.currentBlock, sem, carrier, state);
  }

  processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
  processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
}
483
+
352
484
  function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
353
485
  const expanded = {
354
486
  subject,
@@ -371,7 +503,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
371
503
  };
372
504
  }
373
505
 
374
- function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
506
+ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null, state = null) {
375
507
  if (!subject || !predicate || !object) return;
376
508
 
377
509
  const quad = dataFactory.quad(subject, predicate, object);
@@ -420,6 +552,14 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
420
552
  };
421
553
 
422
554
  quadIndex.set(quadKey, originEntry);
555
+
556
+ // Link block to this quad for reverse lookup during rendering
557
+ if (state.currentBlock && block.id === state.currentBlock.id) {
558
+ if (!state.currentBlock.quadKeys) {
559
+ state.currentBlock.quadKeys = [];
560
+ }
561
+ state.currentBlock.quadKeys.push(quadKey);
562
+ }
423
563
  }
424
564
  }
425
565
 
@@ -513,7 +653,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
513
653
  state.df.namedNode(expandedType),
514
654
  state.df,
515
655
  { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
516
- state.statements, state.statementCandidates
656
+ state.statements, state.statementCandidates,
657
+ state
517
658
  );
518
659
  };
519
660
 
@@ -557,43 +698,16 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
557
698
  emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
558
699
  role.subject, P, role.object, state.df,
559
700
  { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
560
- state.statements, state.statementCandidates
701
+ state.statements, state.statementCandidates,
702
+ state
561
703
  );
562
704
  }
563
705
  });
564
706
  }
565
707
 
566
708
  function processAnnotation(carrier, sem, state, options = {}) {
567
- const { preserveGlobalSubject = false, implicitSubject = null } = options;
568
-
569
- if (sem.subject === 'RESET') {
570
- state.currentSubject = null;
571
- return;
572
- }
573
-
574
- const previousSubject = state.currentSubject;
575
- const newSubject = resolveSubject(sem, state);
576
- const localObject = resolveObject(sem, state);
577
-
578
- const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
579
- if (newSubject && !preserveGlobalSubject && !implicitSubject) {
580
- state.currentSubject = newSubject;
581
- }
582
- const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
583
- if (!S) return;
584
-
585
- const block = createBlock(
586
- S.value, sem.types, sem.predicates,
587
- carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
588
- carrier.type || null, state.ctx, carrier.text
589
- );
590
-
591
- const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
592
- const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
593
- const newSubjectOrCarrierO = newSubject || carrierO;
594
-
595
- processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
596
- processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
709
+ // Use the enhanced block tracking version
710
+ processAnnotationWithBlockTracking(carrier, sem, state, options);
597
711
  }
598
712
 
599
713
 
@@ -635,20 +749,60 @@ function processStandaloneSubject(token, state) {
635
749
 
636
750
  const TOKEN_PROCESSORS = {
637
751
  heading: (token, state) => {
752
+ const blockEntry = createBlockEntry(token, state);
753
+ state.currentBlock = blockEntry;
754
+ state.blockStack.push(blockEntry.id);
755
+
638
756
  processTokenAnnotations(token, state, token.type);
757
+
758
+ state.blockStack.pop();
759
+ state.currentBlock = state.blockStack.length > 0 ?
760
+ state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
639
761
  },
640
762
  code: (token, state) => {
763
+ const blockEntry = createBlockEntry(token, state);
764
+ state.currentBlock = blockEntry;
765
+ state.blockStack.push(blockEntry.id);
766
+
641
767
  processTokenAnnotations(token, state, token.type);
768
+
769
+ state.blockStack.pop();
770
+ state.currentBlock = state.blockStack.length > 0 ?
771
+ state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
642
772
  },
643
773
  blockquote: (token, state) => {
774
+ const blockEntry = createBlockEntry(token, state);
775
+ state.currentBlock = blockEntry;
776
+ state.blockStack.push(blockEntry.id);
777
+
644
778
  processTokenAnnotations(token, state, token.type);
779
+
780
+ state.blockStack.pop();
781
+ state.currentBlock = state.blockStack.length > 0 ?
782
+ state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
645
783
  },
646
784
  para: (token, state) => {
785
+ const blockEntry = createBlockEntry(token, state);
786
+ state.currentBlock = blockEntry;
787
+ state.blockStack.push(blockEntry.id);
788
+
647
789
  processStandaloneSubject(token, state);
648
790
  processTokenAnnotations(token, state, token.type);
791
+
792
+ state.blockStack.pop();
793
+ state.currentBlock = state.blockStack.length > 0 ?
794
+ state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
649
795
  },
650
796
  list: (token, state) => {
797
+ const blockEntry = createBlockEntry(token, state);
798
+ state.currentBlock = blockEntry;
799
+ state.blockStack.push(blockEntry.id);
800
+
651
801
  processTokenAnnotations(token, state, token.type);
802
+
803
+ state.blockStack.pop();
804
+ state.currentBlock = state.blockStack.length > 0 ?
805
+ state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
652
806
  },
653
807
  };
654
808
 
@@ -659,12 +813,18 @@ export function parse(text, options = {}) {
659
813
  quads: [],
660
814
  quadBuffer: new Map(),
661
815
  removeSet: new Set(),
662
- origin: { quadIndex: new Map() },
816
+ origin: {
817
+ quadIndex: new Map(),
818
+ blocks: new Map(),
819
+ documentStructure: []
820
+ },
663
821
  currentSubject: null,
664
822
  tokens: null,
665
823
  currentTokenIndex: -1,
666
824
  statements: [],
667
- statementCandidates: new Map() // Track incomplete rdf:Statement patterns
825
+ statementCandidates: new Map(), // Track incomplete rdf:Statement patterns
826
+ currentBlock: null,
827
+ blockStack: []
668
828
  };
669
829
 
670
830
  state.tokens = scanTokens(text);