mdld-parse 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.js CHANGED
@@ -9,23 +9,20 @@ import {
9
9
  hash
10
10
  } from './utils.js';
11
11
 
12
- // Constants and patterns
13
12
  const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
14
13
  const FENCE_REGEX = /^(`{3,})(.*)/;
15
14
  const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
16
15
  const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
17
- const LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
16
+ const UNORDERED_LIST_REGEX = /^(\s*)([-*+])\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
17
+ const ORDERED_LIST_REGEX = /^(\s*)(\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
18
18
  const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
19
19
  const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
20
20
  const LIST_CONTEXT_REGEX = /^(.+?)\s*\{([^}]+)\}$/;
21
-
22
- // Inline carrier pattern constants
23
21
  const INLINE_CARRIER_PATTERNS = {
24
22
  EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
25
23
  CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
26
24
  };
27
25
 
28
- // Semantic block cache to avoid repeated parsing
29
26
  const semCache = {};
30
27
  const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
31
28
 
@@ -39,41 +36,50 @@ function parseSemCached(attrs) {
39
36
  return sem;
40
37
  }
41
38
 
39
/**
 * Compute the absolute source ranges of a block token's value text and of its
 * trailing `{...}` attribute group.
 *
 * @param {string} line - Full text of the line the token was scanned from.
 * @param {string|null} attrs - Raw attribute group text, or null when absent.
 * @param {number} lineStart - Absolute offset of `line` within the document.
 * @param {number} prefixLength - Length of the leading marker (indent + bullet, or heading hashes).
 * @param {number} valueLength - Length of the captured (untrimmed) value text.
 * @returns {{valueRange: number[], attrsRange: (number[]|null)}} Absolute `[start, end]` pairs.
 */
function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
  // The scanning regexes consume the *whole* whitespace run after the marker
  // (`\s+`), so the value starts after every whitespace character, not just
  // the first one. Assuming a single space shifts the range for `#  Title`.
  const wsLength = line.slice(prefixLength).match(/^\s*/)[0].length;

  // When everything after the marker is whitespace, the regex backtracks and
  // captures the tail of that run as the value; clamp so the range stays
  // inside the line in that case.
  const valueStartInLine = Math.max(
    prefixLength,
    Math.min(prefixLength + wsLength, line.length - valueLength)
  );

  return {
    valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
    attrsRange: calcAttrsRange(line, attrs, lineStart)
  };
}

/**
 * Locate the attribute group inside its line.
 *
 * @param {string} line - Full text of the line.
 * @param {string|null} attrs - Raw attribute group text, or null when absent.
 * @param {number} lineStart - Absolute offset of `line` within the document.
 * @returns {number[]|null} Absolute `[start, end]` of the last occurrence of
 *   `attrs` in `line`, or null when `attrs` is empty or not found.
 */
function calcAttrsRange(line, attrs, lineStart) {
  if (!attrs) return null;
  // Attribute groups trail the value, so the last occurrence is the right one.
  const attrsStartInLine = line.lastIndexOf(attrs);
  return attrsStartInLine >= 0
    ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length]
    : null;
}
47
54
 
48
- function calcValueRange(lineStart, valueStartInLine, valueEndInLine) {
49
- return [lineStart + valueStartInLine, lineStart + valueEndInLine];
50
- }
51
-
52
55
  function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
53
56
  const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
54
- // Add lazy carrier caching
55
57
  Object.defineProperty(token, '_carriers', {
56
- enumerable: false,
57
- writable: true,
58
- value: null
58
+ enumerable: false, writable: true, value: null
59
59
  });
60
60
  return token;
61
61
  }
62
62
 
63
63
  function getCarriers(token) {
64
- if (!token._carriers) {
65
- token._carriers = extractInlineCarriers(token.text, token.range[0]);
66
- }
67
- return token._carriers;
64
+ if (token.type === 'code') return [];
65
+ return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
68
66
  }
69
67
 
68
/**
 * Build a list token from a list-regex match.
 *
 * The match is expected to carry: [1] leading indent, [2] bullet or `N.`
 * marker, [3] item text, [4] optional `{...}` attribute group.
 *
 * @param {string} type - Token type to emit (e.g. 'unordered-list', 'ordered-list').
 * @param {string} line - Full text of the scanned line.
 * @param {number} lineStart - Absolute offset of `line` within the document.
 * @param {number} pos - Offset just past the line; the token range ends at `pos - 1`.
 * @param {RegExpExecArray} match - Result of executing the list regex on `line`.
 * @param {number|null} [indent=null] - Explicit indent. When omitted (null) the
 *   indent is taken from the match and the numeric marker is recorded as
 *   `number`, which is how the ordered-list processor calls this helper.
 * @returns {object} Token created by `createToken`.
 */
const createListToken = (type, line, lineStart, pos, match, indent = null) => {
  const attrs = match[4] || null;
  const prefix = match[1].length + (match[2] ? match[2].length : 0);
  const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
  const extra = indent !== null
    ? { indent }
    // Explicit radix: the marker is always decimal (`\d+\.`).
    : { indent: match[1].length, number: Number.parseInt(match[2], 10) };
  return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
    rangeInfo.attrsRange, rangeInfo.valueRange, extra);
};
76
+
70
77
  function scanTokens(text) {
71
78
  const tokens = [];
72
79
  const lines = text.split('\n');
73
80
  let pos = 0;
74
81
  let codeBlock = null;
75
82
 
76
- // Token processors in order of priority
77
83
  const processors = [
78
84
  {
79
85
  test: line => line.startsWith('```'),
@@ -109,14 +115,14 @@ function scanTokens(text) {
109
115
  });
110
116
  codeBlock = null;
111
117
  }
112
- return true; // handled
118
+ return true;
113
119
  }
114
120
  },
115
121
  {
116
122
  test: () => codeBlock,
117
123
  process: line => {
118
124
  codeBlock.content.push(line);
119
- return true; // handled
125
+ return true;
120
126
  }
121
127
  },
122
128
  {
@@ -124,7 +130,7 @@ function scanTokens(text) {
124
130
  process: (line, lineStart, pos) => {
125
131
  const match = PREFIX_REGEX.exec(line);
126
132
  tokens.push({ type: 'prefix', prefix: match[1], iri: match[2].trim() });
127
- return true; // handled
133
+ return true;
128
134
  }
129
135
  },
130
136
  {
@@ -133,32 +139,26 @@ function scanTokens(text) {
133
139
  const match = HEADING_REGEX.exec(line);
134
140
  const attrs = match[3] || null;
135
141
  const afterHashes = match[1].length;
136
- const wsLength = afterHashes < line.length && line[afterHashes] === ' ' ? 1 :
137
- line.slice(afterHashes).match(/^\s+/)?.[0]?.length || 0;
138
- const valueStartInLine = afterHashes + wsLength;
139
- const valueEndInLine = valueStartInLine + match[2].length;
142
+ const rangeInfo = calcRangeInfo(line, attrs, lineStart, afterHashes, match[2].length);
140
143
  tokens.push(createToken('heading', [lineStart, pos - 1], match[2].trim(), attrs,
141
- calcAttrsRange(line, attrs, lineStart),
142
- calcValueRange(lineStart, valueStartInLine, valueEndInLine),
143
- { depth: match[1].length }));
144
- return true; // handled
144
+ rangeInfo.attrsRange, rangeInfo.valueRange, { depth: match[1].length }));
145
+ return true;
145
146
  }
146
147
  },
147
148
  {
148
- test: line => LIST_REGEX.test(line),
149
+ test: line => UNORDERED_LIST_REGEX.test(line),
149
150
  process: (line, lineStart, pos) => {
150
- const match = LIST_REGEX.exec(line);
151
- const attrs = match[4] || null;
152
- const prefix = match[1].length + match[2].length;
153
- const wsLength = prefix < line.length && line[prefix] === ' ' ? 1 :
154
- line.slice(prefix).match(/^\s+/)?.[0]?.length || 0;
155
- const valueStartInLine = prefix + wsLength;
156
- const valueEndInLine = valueStartInLine + match[3].length;
157
- tokens.push(createToken('list', [lineStart, pos - 1], match[3].trim(), attrs,
158
- calcAttrsRange(line, attrs, lineStart),
159
- calcValueRange(lineStart, valueStartInLine, valueEndInLine),
160
- { indent: match[1].length }));
161
- return true; // handled
151
+ const match = UNORDERED_LIST_REGEX.exec(line);
152
+ tokens.push(createListToken('unordered-list', line, lineStart, pos, match, match[1].length));
153
+ return true;
154
+ }
155
+ },
156
+ {
157
+ test: line => ORDERED_LIST_REGEX.test(line),
158
+ process: (line, lineStart, pos) => {
159
+ const match = ORDERED_LIST_REGEX.exec(line);
160
+ tokens.push(createListToken('ordered-list', line, lineStart, pos, match));
161
+ return true;
162
162
  }
163
163
  },
164
164
  {
@@ -170,15 +170,15 @@ function scanTokens(text) {
170
170
  const valueEndInLine = valueStartInLine + match[1].length;
171
171
  tokens.push(createToken('blockquote', [lineStart, pos - 1], match[1].trim(), attrs,
172
172
  calcAttrsRange(line, attrs, lineStart),
173
- calcValueRange(lineStart, valueStartInLine, valueEndInLine)));
174
- return true; // handled
173
+ [lineStart + valueStartInLine, lineStart + valueEndInLine]));
174
+ return true;
175
175
  }
176
176
  },
177
177
  {
178
178
  test: line => line.trim(),
179
179
  process: (line, lineStart, pos) => {
180
180
  tokens.push(createToken('para', [lineStart, pos - 1], line.trim()));
181
- return true; // handled
181
+ return true;
182
182
  }
183
183
  }
184
184
  ];
@@ -191,7 +191,7 @@ function scanTokens(text) {
191
191
  // Try each processor until one handles the line
192
192
  for (const processor of processors) {
193
193
  if (processor.test(line) && processor.process(line, lineStart, pos)) {
194
- break; // line handled, move to next line
194
+ break;
195
195
  }
196
196
  }
197
197
  }
@@ -207,44 +207,35 @@ function extractInlineCarriers(text, baseOffset = 0) {
207
207
  const carriers = [];
208
208
  let pos = 0;
209
209
 
210
- // Unified carrier extractor with pattern-based handlers
211
- const extractCarrier = (text, pos, baseOffset) => {
212
- // Angle-bracket URLs: <URL>{...}
213
- if (text[pos] === '<') {
210
+ const CARRIER_EXTRACTORS = {
211
+ '<': (text, pos, baseOffset) => {
214
212
  const angleEnd = text.indexOf('>', pos);
215
- if (angleEnd !== -1) {
216
- const url = text.slice(pos + 1, angleEnd);
217
- if (URL_REGEX.test(url)) {
218
- const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, angleEnd + 1, baseOffset);
219
- return createCarrier('link', url, attrs, attrsRange,
220
- [baseOffset + pos + 1, baseOffset + angleEnd],
221
- [baseOffset + pos, baseOffset + finalSpanEnd],
222
- finalSpanEnd, { url });
223
- }
224
- }
225
- return null;
226
- }
227
-
228
- // Bracketed links: [text](URL){...} and [text]{...}
229
- if (text[pos] === '[') {
213
+ if (angleEnd === -1) return null;
214
+ const url = text.slice(pos + 1, angleEnd);
215
+ if (!URL_REGEX.test(url)) return null;
216
+ const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, angleEnd + 1, baseOffset);
217
+ return createCarrier('link', url, attrs, attrsRange,
218
+ [baseOffset + pos + 1, baseOffset + angleEnd],
219
+ [baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url });
220
+ },
221
+ '[': (text, pos, baseOffset) => {
230
222
  const bracketEnd = findMatchingBracket(text, pos);
231
- if (bracketEnd) {
232
- const carrierText = text.slice(pos + 1, bracketEnd - 1);
233
- const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
234
- const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
235
- const { carrierType, resourceIRI } = determineCarrierType(url);
236
-
237
- if (url?.startsWith('=')) return { skip: true, pos: finalSpanEnd };
238
-
239
- return createCarrier(carrierType, carrierText, attrs, attrsRange,
240
- [baseOffset + pos + 1, baseOffset + bracketEnd - 1],
241
- [baseOffset + pos, baseOffset + finalSpanEnd],
242
- finalSpanEnd, { url: resourceIRI });
243
- }
244
- return null;
223
+ if (!bracketEnd) return null;
224
+ const carrierText = text.slice(pos + 1, bracketEnd - 1);
225
+ const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
226
+ const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
227
+ const { carrierType, resourceIRI } = determineCarrierType(url);
228
+ if (url?.startsWith('=')) return { skip: true, pos: finalSpanEnd };
229
+ return createCarrier(carrierType, carrierText, attrs, attrsRange,
230
+ [baseOffset + pos + 1, baseOffset + bracketEnd - 1],
231
+ [baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url: resourceIRI });
245
232
  }
233
+ };
234
+
235
+ const extractCarrier = (text, pos, baseOffset) => {
236
+ const extractor = CARRIER_EXTRACTORS[text[pos]];
237
+ if (extractor) return extractor(text, pos, baseOffset);
246
238
 
247
- // Regex-based carriers: emphasis and code spans
248
239
  for (const [type, pattern] of Object.entries(INLINE_CARRIER_PATTERNS)) {
249
240
  pattern.lastIndex = pos;
250
241
  const match = pattern.exec(text);
@@ -255,7 +246,6 @@ function extractInlineCarriers(text, baseOffset = 0) {
255
246
  ranges.attrsRange, ranges.valueRange, ranges.range, ranges.pos);
256
247
  }
257
248
  }
258
-
259
249
  return null;
260
250
  };
261
251
 
@@ -352,14 +342,9 @@ function createBlock(subject, types, predicates, entries, range, attrsRange, val
352
342
  predicates: predicates.map(p => ({ iri: expandIRI(p.iri, ctx), form: p.form }))
353
343
  };
354
344
 
355
- const signature = [
356
- subject,
357
- carrierType || 'unknown',
358
- expanded.types.join(','),
359
- expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')
360
- ].join('|');
361
-
345
+ const signature = [subject, carrierType || 'unknown', expanded.types.join(','), expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')].join('|');
362
346
  const blockId = hash(signature);
347
+
363
348
  return {
364
349
  id: blockId,
365
350
  range: { start: range[0], end: range[1] },
@@ -386,93 +371,75 @@ function emitQuad(quads, quadIndex, blockId, subject, predicate, object, dataFac
386
371
 
387
372
  quadIndex.set(quadIndexKey(quad.subject, quad.predicate, quad.object), slotInfo);
388
373
  }
374
+ const resolveFragment = (fragment, state) => {
375
+ if (!state.currentSubject) return null;
376
+ const baseIRI = state.currentSubject.value.split('#')[0];
377
+ return state.df.namedNode(`${baseIRI}#${fragment}`);
378
+ };
379
+
389
380
  function resolveSubject(sem, state) {
390
381
  if (!sem.subject) return null;
391
382
  if (sem.subject === 'RESET') {
392
383
  state.currentSubject = null;
393
384
  return null;
394
385
  }
395
- if (sem.subject.startsWith('=#')) {
396
- const fragment = sem.subject.substring(2);
397
- if (state.currentSubject) {
398
- const baseIRI = state.currentSubject.value.split('#')[0];
399
- return state.df.namedNode(`${baseIRI}#${fragment}`);
400
- }
401
- return null;
402
- }
386
+ if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
403
387
  return state.df.namedNode(expandIRI(sem.subject, state.ctx));
404
388
  }
405
389
 
406
390
  function resolveObject(sem, state) {
407
391
  if (!sem.object) return null;
408
- if (sem.object.startsWith('#')) {
409
- const fragment = sem.object.substring(1);
410
- if (state.currentSubject) {
411
- const baseIRI = state.currentSubject.value.split('#')[0];
412
- return state.df.namedNode(`${baseIRI}#${fragment}`);
413
- }
414
- return null;
415
- }
392
+ if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
416
393
  return state.df.namedNode(expandIRI(sem.object, state.ctx));
417
394
  }
418
395
 
396
+ const createTypeQuad = (typeIRI, subject, state, blockId, entryIndex = null) => {
397
+ const expandedType = expandIRI(typeIRI, state.ctx);
398
+ emitQuad(
399
+ state.quads, state.origin.quadIndex, blockId,
400
+ subject,
401
+ state.df.namedNode(expandIRI('rdf:type', state.ctx)),
402
+ state.df.namedNode(expandedType),
403
+ state.df,
404
+ { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
405
+ );
406
+ };
407
+
419
408
  function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier) {
420
409
  sem.types.forEach(t => {
421
410
  const typeIRI = typeof t === 'string' ? t : t.iri;
422
411
  const entryIndex = typeof t === 'string' ? null : t.entryIndex;
423
-
424
- // For angle-bracket URLs and bracketed links [text](URL), use the URL as the subject
425
- // for type declarations when there's no explicit subject declaration.
426
- // This implements {+URL} soft subject behavior.
427
412
  let typeSubject = newSubject ? newSubject : (localObject || carrierO || S);
428
413
  if (carrier?.type === 'link' && carrier?.url && !newSubject) {
429
- typeSubject = carrierO; // Use URL as subject for type declarations
414
+ typeSubject = carrierO;
430
415
  }
431
-
432
- const expandedType = expandIRI(typeIRI, state.ctx);
433
-
434
- emitQuad(
435
- state.quads, state.origin.quadIndex, block.id,
436
- typeSubject,
437
- state.df.namedNode(expandIRI('rdf:type', state.ctx)),
438
- state.df.namedNode(expandedType),
439
- state.df,
440
- { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
441
- );
416
+ createTypeQuad(typeIRI, typeSubject, state, block.id, entryIndex);
442
417
  });
443
418
  }
444
419
 
420
/**
 * Decide which nodes act as subject and object for one predicate, based on
 * the predicate form and the carrier it is attached to.
 *
 * Forms:
 *   ''  literal predicate: object is the literal `L`
 *   '?' forward link:      (previous/current subject) -> (local object or new subject / carrier object)
 *   '!' reverse link:      the same two nodes with subject and object swapped
 *
 * @param {{form: string}} pred - Predicate descriptor.
 * @param {{type: string, url: ?string, text: string}|null|undefined} carrier - Carrier the annotation belongs to.
 * @param {*} newSubject - Subject declared by the annotation, if any.
 * @param {*} previousSubject - Subject in effect before this annotation.
 * @param {*} localObject - Object declared by the annotation, if any.
 * @param {*} newSubjectOrCarrierO - Declared subject, or the carrier's own resource.
 * @param {*} S - Effective subject.
 * @param {*} L - Literal node built from the carrier text.
 * @returns {{subject: *, object: *}|null} Role assignment, or null when no quad should be emitted.
 */
const determinePredicateRole = (pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L) => {
  switch (pred.form) {
    case '': {
      const isUrlLink = Boolean(carrier?.type === 'link' && carrier?.url);
      // A link whose text equals its URL takes no literal predicate.
      if (isUrlLink && carrier.text === carrier.url) return null;
      // A link with distinct text and no declared subject describes its URL.
      return isUrlLink && !newSubject
        ? { subject: newSubjectOrCarrierO, object: L }
        : { subject: localObject || S, object: L };
    }
    case '?':
      return { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
    case '!':
      return { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
    default:
      return null;
  }
};
437
+
445
438
  function processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier) {
446
439
  sem.predicates.forEach(pred => {
447
- const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
448
-
449
- // Skip literal predicates for angle-bracket URLs only
450
- if (pred.form === '' && carrier?.type === 'link' && carrier?.url && carrier.text === carrier.url) {
451
- return;
452
- }
453
-
454
- // Determine subject/object roles based on predicate form
455
- let role;
456
- switch (pred.form) {
457
- case '':
458
- // For bracketed links with literal predicates and no explicit subject, use URL as subject
459
- if (carrier?.type === 'link' && carrier?.url && carrier.text !== carrier.url && !newSubject) {
460
- role = { subject: newSubjectOrCarrierO, object: L };
461
- } else {
462
- role = { subject: localObject || S, object: L };
463
- }
464
- break;
465
- case '?':
466
- role = { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
467
- break;
468
- case '!':
469
- role = { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
470
- break;
471
- default:
472
- role = null;
473
- }
474
-
440
+ const role = determinePredicateRole(pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L);
475
441
  if (role) {
442
+ const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
476
443
  emitQuad(state.quads, state.origin.quadIndex, block.id,
477
444
  role.subject, P, role.object, state.df,
478
445
  { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
@@ -493,7 +460,6 @@ function processAnnotation(carrier, sem, state, options = {}) {
493
460
  const newSubject = resolveSubject(sem, state);
494
461
  const localObject = resolveObject(sem, state);
495
462
 
496
- // Use implicit subject if provided (for list items)
497
463
  const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
498
464
  if (newSubject && !preserveGlobalSubject && !implicitSubject) {
499
465
  state.currentSubject = newSubject;
@@ -542,7 +508,6 @@ export function findItemSubject(listToken, carriers, state) {
542
508
  }
543
509
 
544
510
  function hasOwnPredicates(listToken, carriers) {
545
- // Check for explicit predicates (excluding subject declarations)
546
511
  if (listToken.attrs) {
547
512
  const attrs = parseSemCached(listToken.attrs);
548
513
  if (attrs.predicates.some(p => !p.subject && p.iri !== 'RESET')) {
@@ -555,9 +520,7 @@ function hasOwnPredicates(listToken, carriers) {
555
520
  });
556
521
  }
557
522
 
558
- // Unified list context processing
559
- function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) {
560
- // Emit types
523
+ const processContextSem = ({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) => {
561
524
  sem.types.forEach(t => {
562
525
  const typeIRI = typeof t === 'string' ? t : t.iri;
563
526
  emitQuad(
@@ -569,7 +532,6 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
569
532
  );
570
533
  });
571
534
 
572
- // Emit directional predicates
573
535
  sem.predicates.forEach(pred => {
574
536
  const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
575
537
  if (pred.form === '!') {
@@ -579,46 +541,35 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
579
541
  }
580
542
  });
581
543
 
582
- // Optionally inherit literal predicates
583
544
  if (inheritLiterals) {
584
545
  const literalPredicates = sem.predicates.filter(p => p.form === '');
585
546
  if (literalPredicates.length > 0) {
586
547
  return {
587
- subject: null,
588
- object: null,
589
- types: [],
548
+ subject: null, object: null, types: [],
590
549
  predicates: literalPredicates.map(p => ({ iri: p.iri, form: p.form, entryIndex: p.entryIndex })),
591
- datatype: null,
592
- language: null,
593
- entries: []
550
+ datatype: null, language: null, entries: []
594
551
  };
595
552
  }
596
553
  }
597
554
  return null;
598
- }
555
+ };
599
556
 
600
- // List stack management functions
601
- function manageListStack(token, state) {
602
- // Pop stack frames for lists that have ended (strictly less indent)
603
- while (
604
- state.listStack.length &&
605
- token.indent < state.listStack[state.listStack.length - 1].indent
606
- ) {
557
+ const manageListStack = (token, state) => {
558
+ while (state.listStack.length && token.indent < state.listStack[state.listStack.length - 1].indent) {
607
559
  state.listStack.pop();
608
560
  }
609
561
 
610
- // If we have pending context, always create a new frame for it
611
562
  if (state.pendingListContext) {
612
563
  state.listStack.push({
613
564
  indent: token.indent,
614
565
  anchorSubject: state.pendingListContext.subject,
615
566
  contextSubject: state.pendingListContext.subject,
616
- contextSem: state.pendingListContext.sem
567
+ contextSem: state.pendingListContext.sem,
568
+ contextText: state.pendingListContext.contextText,
569
+ contextToken: state.pendingListContext.contextToken // Store context token for origins
617
570
  });
618
571
  state.pendingListContext = null;
619
572
  } else if (state.listStack.length === 0 || token.indent > state.listStack[state.listStack.length - 1].indent) {
620
- // Push empty frame for nested lists without explicit context
621
- // Inherit anchorSubject from parent frame if available
622
573
  const parentFrame = state.listStack.length > 0 ? state.listStack[state.listStack.length - 1] : null;
623
574
  state.listStack.push({
624
575
  indent: token.indent,
@@ -627,138 +578,278 @@ function manageListStack(token, state) {
627
578
  contextSem: null
628
579
  });
629
580
  }
630
- // If token.indent == current frame indent and no pending context, we're at same level - do nothing
631
- }
581
+ };
632
582
 
633
- function processListItem(token, state) {
634
- const carriers = getCarriers(token);
583
+ const combineSemanticInfo = (token, carriers, listFrame, state, itemSubject) => {
584
+ const combinedSem = { subject: null, object: null, types: [], predicates: [], datatype: null, language: null, entries: [] };
585
+ const addSem = (sem) => { combinedSem.types.push(...sem.types); combinedSem.predicates.push(...sem.predicates); combinedSem.entries.push(...sem.entries); };
635
586
 
636
- // Find item subject from list token or inline carriers
587
+ if (listFrame?.contextSem) {
588
+ const inheritedSem = processContextSem({ sem: listFrame.contextSem, itemSubject, contextSubject: listFrame.contextSubject, inheritLiterals: true, state });
589
+ if (inheritedSem) addSem(inheritedSem);
590
+ }
591
+
592
+ if (token.attrs) addSem(parseSemCached(token.attrs));
593
+ carriers.forEach(carrier => { if (carrier.attrs) addSem(parseSemCached(carrier.attrs)); });
594
+
595
+ return combinedSem;
596
+ };
597
+
598
+ const processListItem = (token, state) => {
599
+ const carriers = getCarriers(token);
637
600
  const itemInfo = findItemSubject(token, carriers, state);
638
601
  if (!itemInfo) return;
639
602
 
640
603
  const { subject: itemSubject } = itemInfo;
641
-
642
- // Update the current list frame to track this item's subject for nested contexts
643
- if (state.listStack.length > 0) {
644
- const currentFrame = state.listStack[state.listStack.length - 1];
645
- currentFrame.anchorSubject = itemSubject;
646
- }
604
+ if (state.listStack.length > 0) state.listStack[state.listStack.length - 1].anchorSubject = itemSubject;
647
605
 
648
606
  const listFrame = state.listStack[state.listStack.length - 1];
607
+ const combinedSem = combineSemanticInfo(token, carriers, listFrame, state, itemSubject);
649
608
 
650
- // Collect all semantic information for this list item
651
- let combinedSem = {
652
- subject: null,
653
- object: null,
654
- types: [],
655
- predicates: [],
656
- datatype: null,
657
- language: null,
658
- entries: []
659
- };
609
+ if (combinedSem.entries.length > 0) {
610
+ const prevSubject = state.currentSubject;
611
+ state.currentSubject = itemSubject;
660
612
 
661
- // Apply list context if available - inherit everything
662
- if (listFrame?.contextSem) {
663
- const inheritedSem = processContextSem({
664
- sem: listFrame.contextSem,
665
- itemSubject,
666
- contextSubject: listFrame.contextSubject,
667
- inheritLiterals: true,
668
- state
669
- });
613
+ processAnnotation({ type: 'list', text: token.text, range: token.range, attrsRange: token.attrsRange || null, valueRange: token.valueRange || null }, combinedSem, state, { preserveGlobalSubject: !state.listStack.length, implicitSubject: itemSubject });
670
614
 
671
- if (inheritedSem) {
672
- combinedSem.types.push(...inheritedSem.types);
673
- combinedSem.predicates.push(...inheritedSem.predicates);
674
- combinedSem.entries.push(...inheritedSem.entries);
675
- }
615
+ state.currentSubject = prevSubject;
676
616
  }
617
+ };
677
618
 
678
- // Add item's own annotations
679
- if (token.attrs) {
680
- const sem = parseSemCached(token.attrs);
681
- combinedSem.types.push(...sem.types);
682
- combinedSem.predicates.push(...sem.predicates);
683
- combinedSem.entries.push(...sem.entries);
619
/**
 * Apply the types and literal predicates of a list's context annotation to one
 * list item, recording for each emitted quad the source range of the
 * annotation entry that produced it.
 *
 * Only entries that carry a `relRange` are emitted. `?` and `!` predicates are
 * skipped here.
 *
 * @param {*} itemSubject - Subject node of the list item.
 * @param {{types: Array, predicates: Array, entries: Array}} contextSem - Parsed context annotation.
 * @param {object} state - Parser state; uses `df`, `ctx`, `quads` and `origin.quadIndex`.
 * @param {string} listItemText - Item text, used as the literal value.
 * @param {{range: number[], text: string}|null|undefined} contextToken - Token
 *   the context annotation came from; ranges fall back to offset 0 without it.
 */
const applyListAnchorAnnotations = (itemSubject, contextSem, state, listItemText, contextToken) => {
  // Absolute offset of the `{...}` group inside the context token, or the
  // token start when no group can be found in its text.
  const tokenStart = contextToken?.range ? contextToken.range[0] : 0;
  const annotationMatch = (contextToken?.text || '').match(/\{[^}]+\}/);
  const annotationStart = annotationMatch && contextToken.range
    ? tokenStart + annotationMatch.index
    : tokenStart;

  // relRange is relative to the annotation group, so shift it to the document.
  const toAbsoluteRange = relRange => [annotationStart + relRange.start, annotationStart + relRange.end];

  contextSem.types.forEach(type => {
    // Types may be plain strings or { iri, entryIndex } objects, as handled
    // by the other type processors in this file.
    const typeIRI = typeof type === 'string' ? type : type.iri;
    const entryIndex = typeof type === 'string' ? null : type.entryIndex;
    const entry = contextSem.entries.find(e => e.kind === 'type' && e.iri === typeIRI);
    if (!entry?.relRange) return;

    emitQuad(state.quads, state.origin.quadIndex, 'list-anchor-type',
      itemSubject,
      state.df.namedNode(expandIRI('rdf:type', state.ctx)),
      state.df.namedNode(expandIRI(typeIRI, state.ctx)),
      state.df,
      { type: 'list-anchor', range: toAbsoluteRange(entry.relRange), entryIndex }
    );
  });

  contextSem.predicates.forEach(pred => {
    // Directional predicates connect the context to the list; they are not item values.
    if (pred.form === '?' || pred.form === '!') return;
    const entry = contextSem.entries.find(e => e.kind === 'property' && e.iri === pred.iri);
    if (!entry?.relRange) return;

    const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
    // The literal value is the list item's own text.
    const objectValue = state.df.literal(listItemText || '');

    emitQuad(state.quads, state.origin.quadIndex, 'list-anchor-predicate',
      itemSubject, P, objectValue, state.df,
      { type: 'list-anchor', range: toAbsoluteRange(entry.relRange), entryIndex: pred.entryIndex }
    );
  });
};
680
+
681
+ function processOrderedListItem(token, state) {
682
+ if (!state.isProcessingOrderedList) {
683
+ state.listCounter = (state.listCounter || 0) + 1;
684
+ state.rdfListIndex = 0;
685
+ state.firstListNode = null;
686
+ state.previousListNode = null;
687
+ state.contextConnected = false;
688
+ state.isProcessingOrderedList = true;
689
+ }
690
+
691
+ generateRdfListTriples(token, state);
692
+
693
+ const listFrame = state.listStack[state.listStack.length - 1];
694
+ if (listFrame?.contextSem) {
695
+ const carriers = getCarriers(token);
696
+ const itemInfo = findItemSubject(token, carriers, state);
697
+ if (itemInfo?.subject) {
698
+ applyListAnchorAnnotations(itemInfo.subject, listFrame.contextSem, state, token.text, listFrame.contextToken);
699
+ }
700
+ }
701
+
702
+ if (listFrame?.contextSem && listFrame?.contextSubject && !state.contextConnected) {
703
+ listFrame.contextSem.predicates.forEach(pred => {
704
+ if (pred.form === '?') {
705
+ const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
706
+ const firstListNode = state.firstListNode;
707
+ if (firstListNode) {
708
+ emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-context',
709
+ listFrame.contextSubject, P, state.df.namedNode(firstListNode), state.df);
710
+ state.contextConnected = true;
711
+ }
712
+ }
713
+ });
713
714
  }
714
715
  }
715
716
 
716
- function processListContextFromParagraph(token, state) {
717
- const contextMatch = LIST_CONTEXT_REGEX.exec(token.text);
717
/**
 * Appends one cell to the RDF collection being built for an ordered list.
 *
 * For the given list-item token the following quads are emitted, in order:
 *   1. `<cell> rdf:type  rdf:List`
 *   2. `<cell> rdf:first <object>` where the object is `itemInfo.value`,
 *      else `itemInfo.subject`, else a literal made from `token.text`
 *   3. if a previous cell exists and still has an `rdf:rest` quad, that quad
 *      is removed and re-emitted pointing at the new cell
 *   4. `<cell> rdf:rest  rdf:nil` (the new cell is always the current tail)
 *
 * State read/written: `rdfListIndex` (incremented), `firstListNode` (set once),
 * `previousListNode` (set to the new cell IRI), `quads` and `origin.quadIndex`
 * (through `emitQuad`).
 *
 * @param {object} token - ordered-list item token (`text`, `range`, optional
 *   `valueRange` / `attrsRange`).
 * @param {object} state - parser state.
 * @returns {void}
 */
function generateRdfListTriples(token, state) {
    const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
    const RDF_LIST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#List';
    const RDF_FIRST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first';
    const RDF_REST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest';
    const RDF_NIL = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil';

    // Build every term with the factory carried on the parser state, as the
    // other list-handling code in this file does, so all quads of one parse
    // come from a single factory. Fall back to the imported DataFactory when
    // the state carries none.
    const df = state.df ?? DataFactory;

    const carriers = getCarriers(token);
    const listIndex = (state.rdfListIndex || 0) + 1;
    state.rdfListIndex = listIndex;
    const listNodeName = `list-${state.listCounter}-${listIndex}`;

    // The cell IRI is minted as a fragment of the context subject's IRI
    // (or of the default-prefix IRI when no subject is known).
    const listFrame = state.listStack[state.listStack.length - 1];
    const contextSubject = listFrame?.contextSubject || state.currentSubject || state.documentSubject;
    const baseIRI = contextSubject ? contextSubject.value : (state.ctx[''] || '');

    // Drop any fragment already present on the base so the result has exactly one '#'.
    const listNodeIri = baseIRI.includes('#')
        ? `${baseIRI.split('#')[0]}#${listNodeName}`
        : `${baseIRI}#${listNodeName}`;

    if (!state.firstListNode) state.firstListNode = listNodeIri;

    const cellNode = df.namedNode(listNodeIri);
    const valueOrTokenRange = token.valueRange || token.range;

    // 1. rdf:type
    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-type',
        cellNode,
        df.namedNode(RDF_TYPE),
        df.namedNode(RDF_LIST),
        df,
        { type: 'ordered-list', range: valueOrTokenRange, listNodeName }
    );

    // 2. rdf:first — pick the object and the source range that produced it.
    const itemInfo = findItemSubject(token, carriers, state);
    let firstObject;
    if (itemInfo?.value) {
        firstObject = itemInfo.value;
    } else if (itemInfo?.subject) {
        firstObject = itemInfo.subject;
    } else {
        firstObject = df.literal(token.text);
    }

    // Items that resolved to a subject point at the annotation range;
    // everything else points at the value range.
    const originRange = itemInfo?.subject
        ? (token.attrsRange || token.valueRange || token.range)
        : valueOrTokenRange;

    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-first',
        cellNode,
        df.namedNode(RDF_FIRST),
        firstObject,
        df,
        { type: 'ordered-list', range: originRange, listNodeName }
    );

    // 3. Re-link the previous tail: replace its rdf:rest quad with one that
    //    targets the new cell.
    if (state.previousListNode) {
        const prevRestQuadIndex = state.quads.findIndex(q =>
            q.subject.value === state.previousListNode &&
            q.predicate.value === RDF_REST
        );
        if (prevRestQuadIndex !== -1) {
            // NOTE(review): only state.quads is spliced here; whether
            // state.origin.quadIndex keeps an entry for the removed quad
            // depends on emitQuad's keying, which is not visible here — confirm.
            state.quads.splice(prevRestQuadIndex, 1);

            // listNodeName carries the previous cell's full IRI here (kept as-is).
            emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-rest-update',
                df.namedNode(state.previousListNode),
                df.namedNode(RDF_REST),
                cellNode,
                df,
                { type: 'ordered-list', range: valueOrTokenRange, listNodeName: state.previousListNode }
            );
        }
    }

    // 4. The new cell terminates the list until another item is appended.
    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-rest',
        cellNode,
        df.namedNode(RDF_REST),
        df.namedNode(RDF_NIL),
        df,
        { type: 'ordered-list', range: valueOrTokenRange, listNodeName }
    );

    state.previousListNode = listNodeIri;
}
803
+
804
/**
 * Recognises a "list context" paragraph and records it on
 * `state.pendingListContext`.
 *
 * A paragraph qualifies when its text matches LIST_CONTEXT_REGEX, i.e. some
 * text followed by a trailing `{...}` block. Nothing happens otherwise.
 *
 * Context subject resolution:
 *   1. `state.currentSubject`, else `state.documentSubject`;
 *   2. if neither is set, the nearest preceding heading token whose
 *      annotation declares a subject that `resolveSubject` can resolve;
 *   3. overridden by the current list frame's `anchorSubject` when the next
 *      token is a list item indented deeper than that frame (nested list).
 *
 * @param {object} token - paragraph token (`text` is read; the token itself
 *   is stored as `contextToken`).
 * @param {object} state - parser state; `pendingListContext` is overwritten
 *   on a match.
 * @returns {void}
 */
function processListContextFromParagraph(token, state) {
    const contextMatch = LIST_CONTEXT_REGEX.exec(token.text);
    if (!contextMatch) return;

    const contextSem = parseSemCached(`{${contextMatch[2]}}`);
    let contextSubject = state.currentSubject || state.documentSubject;

    if (!contextSubject && state.tokens) {
        // Walk backwards to the closest heading that carries a resolvable subject.
        for (let i = state.currentTokenIndex - 1; i >= 0; i--) {
            const prevToken = state.tokens[i];
            if (prevToken.type !== 'heading' || !prevToken.attrs) continue;

            const prevSem = parseSemCached(prevToken.attrs);
            if (!prevSem.subject) continue;

            const resolvedSubject = resolveSubject(prevSem, state);
            if (resolvedSubject) {
                // Keep the term itself, not its `.value` string: every other
                // source of contextSubject is a term, and consumers read
                // `contextSubject.value` or use it directly as a quad subject.
                contextSubject = resolvedSubject;
                break;
            }
        }
    }

    // Nested list: the context belongs to the enclosing list item's subject.
    const nextToken = state.tokens?.[state.currentTokenIndex + 1];
    const nextIsListItem = nextToken &&
        (nextToken.type === 'unordered-list' || nextToken.type === 'ordered-list');
    if (state.listStack.length > 0 && nextIsListItem) {
        const currentFrame = state.listStack[state.listStack.length - 1];
        if (currentFrame.anchorSubject && nextToken.indent > currentFrame.indent) {
            contextSubject = currentFrame.anchorSubject;
        }
    }

    state.pendingListContext = {
        sem: contextSem,
        subject: contextSubject,
        // NOTE(review): replace(':', '') removes only the FIRST colon found
        // anywhere in the text, not specifically a trailing one — confirm intent.
        contextText: contextMatch[1].replace(':', '').trim(),
        // Kept so that origin ranges can point back at the context paragraph.
        contextToken: token
    };
}
744
842
 
745
- // Helper functions for token processing
746
843
  function processTokenAnnotations(token, state, tokenType) {
747
- // Process token's own attributes
748
844
  if (token.attrs) {
749
845
  const sem = parseSemCached(token.attrs);
750
846
  processAnnotation({
751
- type: tokenType,
752
- text: token.text,
753
- range: token.range,
754
- attrsRange: token.attrsRange || null,
755
- valueRange: token.valueRange || null
847
+ type: tokenType, text: token.text, range: token.range,
848
+ attrsRange: token.attrsRange || null, valueRange: token.valueRange || null
756
849
  }, sem, state);
757
850
  }
758
851
 
759
- // Process inline carriers
760
- const carriers = getCarriers(token);
761
- carriers.forEach(carrier => {
852
+ getCarriers(token).forEach(carrier => {
762
853
  if (carrier.attrs) {
763
854
  const sem = parseSemCached(carrier.attrs);
764
855
  processAnnotation(carrier, sem, state);
@@ -772,17 +863,52 @@ function processStandaloneSubject(token, state) {
772
863
 
773
864
  const sem = parseSemCached(`{=${match[1]}}`);
774
865
  const attrsStart = token.range[0] + token.text.indexOf('{=');
775
- const attrsEnd = attrsStart + (match[1] ? match[1].length : 0);
776
-
777
866
  processAnnotation({
778
- type: 'standalone',
779
- text: '',
780
- range: token.range,
781
- attrsRange: [attrsStart, attrsEnd],
867
+ type: 'standalone', text: '', range: token.range,
868
+ attrsRange: [attrsStart, attrsStart + (match[1] ? match[1].length : 0)],
782
869
  valueRange: null
783
870
  }, sem, state);
784
871
  }
785
872
 
873
/**
 * Dispatch table: token `type` -> handler `(token, state)`.
 *
 * Every handler except 'ordered-list' clears `state.isProcessingOrderedList`;
 * a paragraph clears it only when its text contains neither `{?` nor `{!`.
 * Token types without an entry have no handler.
 */
const TOKEN_PROCESSORS = (() => {
    // Shared by block types that only end an ordered-list run and process
    // their own annotations.
    const annotateOutsideOrderedList = (token, state) => {
        state.isProcessingOrderedList = false;
        processTokenAnnotations(token, state, token.type);
    };

    return {
        // A heading that declares a resolvable subject becomes the document subject.
        heading(token, state) {
            state.isProcessingOrderedList = false;
            if (token.attrs) {
                const sem = parseSemCached(token.attrs);
                const resolved = sem.subject ? resolveSubject(sem, state) : null;
                if (resolved) state.documentSubject = resolved;
            }
            processTokenAnnotations(token, state, token.type);
        },

        code: annotateOutsideOrderedList,

        blockquote: annotateOutsideOrderedList,

        para(token, state) {
            const hasMarker = token.text.includes('{?') || token.text.includes('{!');
            if (!hasMarker) {
                state.isProcessingOrderedList = false;
            }
            processStandaloneSubject(token, state);
            processListContextFromParagraph(token, state);
            processTokenAnnotations(token, state, token.type);
        },

        'unordered-list'(token, state) {
            state.isProcessingOrderedList = false;
            manageListStack(token, state);
            processListItem(token, state);
        },

        // Leaves isProcessingOrderedList untouched.
        'ordered-list'(token, state) {
            manageListStack(token, state);
            processOrderedListItem(token, state);
        }
    };
})();
911
+
786
912
  export function parse(text, options = {}) {
787
913
  const state = {
788
914
  ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
@@ -790,65 +916,38 @@ export function parse(text, options = {}) {
790
916
  quads: [],
791
917
  origin: { blocks: new Map(), quadIndex: new Map() },
792
918
  currentSubject: null,
793
- documentSubject: null, // Track main document subject from headings
919
+ documentSubject: null,
794
920
  listStack: [],
795
921
  pendingListContext: null,
796
- tokens: null, // Store tokens for lookahead
797
- currentTokenIndex: -1 // Track current token index
922
+ tokens: null,
923
+ currentTokenIndex: -1,
924
+ listCounter: 0,
925
+ rdfListIndex: 0,
926
+ firstListNode: null,
927
+ previousListNode: null,
928
+ contextConnected: false,
929
+ isProcessingOrderedList: false
798
930
  };
799
931
 
800
932
  state.tokens = scanTokens(text);
801
933
 
802
- // Process prefix declarations first with prefix folding support
803
934
  state.tokens.filter(t => t.type === 'prefix').forEach(t => {
804
- // Check if the IRI value contains a CURIE that references a previously defined prefix
805
935
  let resolvedIri = t.iri;
806
936
  if (t.iri.includes(':')) {
807
- const [potentialPrefix, ...referenceParts] = t.iri.split(':');
808
- const reference = referenceParts.join(':'); // Preserve any additional colons in reference
937
+ const colonIndex = t.iri.indexOf(':');
938
+ const potentialPrefix = t.iri.substring(0, colonIndex);
939
+ const reference = t.iri.substring(colonIndex + 1);
809
940
  if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
810
- // This is a CURIE referencing an existing prefix - resolve it
811
941
  resolvedIri = state.ctx[potentialPrefix] + reference;
812
942
  }
813
943
  }
814
944
  state.ctx[t.prefix] = resolvedIri;
815
945
  });
816
946
 
817
- // Process all other tokens
818
947
  for (let i = 0; i < state.tokens.length; i++) {
819
948
  const token = state.tokens[i];
820
949
  state.currentTokenIndex = i;
821
-
822
- switch (token.type) {
823
- case 'heading':
824
- // Update document subject when processing headings
825
- if (token.attrs) {
826
- const headingSem = parseSemCached(token.attrs);
827
- if (headingSem.subject) {
828
- const subject = resolveSubject(headingSem, state);
829
- if (subject) {
830
- state.documentSubject = subject;
831
- }
832
- }
833
- }
834
- processTokenAnnotations(token, state, token.type);
835
- break;
836
- case 'code':
837
- case 'blockquote':
838
- processTokenAnnotations(token, state, token.type);
839
- break;
840
-
841
- case 'para':
842
- processStandaloneSubject(token, state);
843
- processListContextFromParagraph(token, state);
844
- processTokenAnnotations(token, state, token.type);
845
- break;
846
-
847
- case 'list':
848
- manageListStack(token, state);
849
- processListItem(token, state);
850
- break;
851
- }
950
+ TOKEN_PROCESSORS[token.type]?.(token, state);
852
951
  }
853
952
 
854
953
  return { quads: state.quads, origin: state.origin, context: state.ctx };