mdld-parse 0.4.3 → 0.5.1

package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # MD-LD Parse v0.4.1
1
+ # MD-LD
2
2
 
3
3
  **Markdown-Linked Data (MD-LD)** — a deterministic, streaming-friendly RDF authoring format that extends Markdown with explicit `{...}` annotations.
4
4
 
@@ -40,7 +40,7 @@ ex:armstrong schema:name "Neil Armstrong" .
40
40
  - **Four predicate forms**: `p` (S→L), `?p` (S→O), `!p` (O→S)
41
41
  - **Type declarations**: `.Class` for rdf:type triples
42
42
  - **Datatypes & language**: `^^xsd:date` and `@en` support
43
- - **Lists**: Explicit subject declarations for structured data
43
+ - **Lists**: Explicit subject declarations and numbered ordered lists with `rdf:List` support
44
44
  - **Fragments**: Built-in document structuring with `{=#fragment}`
45
45
  - **Round-trip serialization**: Markdown ↔ RDF ↔ Markdown preserves structure
46
46
 
@@ -184,13 +184,12 @@ ex:armstrong a schema:Person .
184
184
 
185
185
  ### Lists
186
186
 
187
- Lists require explicit subjects per item:
187
+ Lists require explicit subjects per item.
188
188
 
189
189
  ```markdown
190
190
  # Recipe {=ex:recipe}
191
191
 
192
192
  Ingredients: {?ingredient .Ingredient}
193
-
194
193
  - Flour {=ex:flour name}
195
194
  - Water {=ex:water name}
196
195
  ```
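
For context, a minimal sketch of feeding the recipe snippet above to the parser. It assumes the package root re-exports `parse` from `src/parse.js` (the `index.js` entry point is not shown in this diff), and it supplies the `ex:` prefix through `options.context`, which the refactored `parse()` below merges into its default context; the namespace IRI used for `ex` is a placeholder.

```js
// Sketch only: assumes index.js re-exports parse() from src/parse.js.
import { parse } from 'mdld-parse';

const doc = `# Recipe {=ex:recipe}

Ingredients: {?ingredient .Ingredient}
- Flour {=ex:flour name}
- Water {=ex:water name}
`;

// parse() returns { quads, origin, context } (see src/parse.js below).
// 'http://example.org/' is a placeholder namespace for the ex: prefix.
const { quads } = parse(doc, { context: { ex: 'http://example.org/' } });

for (const q of quads) {
  console.log(q.subject.value, q.predicate.value, q.object.value);
}
```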
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mdld-parse",
3
- "version": "0.4.3",
3
+ "version": "0.5.1",
4
4
  "description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
5
5
  "type": "module",
6
6
  "main": "index.js",
package/src/parse.js CHANGED
@@ -9,23 +9,19 @@ import {
9
9
  hash
10
10
  } from './utils.js';
11
11
 
12
- // Constants and patterns
13
12
  const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
14
13
  const FENCE_REGEX = /^(`{3,})(.*)/;
15
14
  const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
16
15
  const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
17
- const LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
16
+ const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
18
17
  const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
19
18
  const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
20
19
  const LIST_CONTEXT_REGEX = /^(.+?)\s*\{([^}]+)\}$/;
21
-
22
- // Inline carrier pattern constants
23
20
  const INLINE_CARRIER_PATTERNS = {
24
21
  EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
25
22
  CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
26
23
  };
27
24
 
28
- // Semantic block cache to avoid repeated parsing
29
25
  const semCache = {};
30
26
  const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
31
27
 
@@ -39,46 +35,50 @@ function parseSemCached(attrs) {
39
35
  return sem;
40
36
  }
41
37
 
38
+ function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
39
+ const wsLength = prefixLength < line.length && line[prefixLength] === ' ' ? 1 :
40
+ line.slice(prefixLength).match(/^\s+/)?.[0]?.length || 0;
41
+ const valueStartInLine = prefixLength + wsLength;
42
+ return {
43
+ valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
44
+ attrsRange: calcAttrsRange(line, attrs, lineStart)
45
+ };
46
+ }
47
+
42
48
  function calcAttrsRange(line, attrs, lineStart) {
43
49
  if (!attrs) return null;
44
50
  const attrsStartInLine = line.lastIndexOf(attrs);
45
51
  return attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length] : null;
46
52
  }
47
53
 
48
- function calcValueRange(lineStart, valueStartInLine, valueEndInLine) {
49
- return [lineStart + valueStartInLine, lineStart + valueEndInLine];
50
- }
51
-
52
54
  function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
53
55
  const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
54
- // Add lazy carrier caching
55
56
  Object.defineProperty(token, '_carriers', {
56
- enumerable: false,
57
- writable: true,
58
- value: null
57
+ enumerable: false, writable: true, value: null
59
58
  });
60
59
  return token;
61
60
  }
62
61
 
63
62
  function getCarriers(token) {
64
- // Skip inline carrier extraction for code blocks to allow safe self-documentation
65
- if (token.type === 'code') {
66
- return [];
67
- }
68
-
69
- if (!token._carriers) {
70
- token._carriers = extractInlineCarriers(token.text, token.range[0]);
71
- }
72
- return token._carriers;
63
+ if (token.type === 'code') return [];
64
+ return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
73
65
  }
74
66
 
67
+ const createListToken = (type, line, lineStart, pos, match, indent = null) => {
68
+ const attrs = match[4] || null;
69
+ const prefix = match[1].length + (match[2] ? match[2].length : 0);
70
+ const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
71
+ const extra = indent !== null ? { indent } : { indent: match[1].length };
72
+ return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
73
+ rangeInfo.attrsRange, rangeInfo.valueRange, extra);
74
+ };
75
+
75
76
  function scanTokens(text) {
76
77
  const tokens = [];
77
78
  const lines = text.split('\n');
78
79
  let pos = 0;
79
80
  let codeBlock = null;
80
81
 
81
- // Token processors in order of priority
82
82
  const processors = [
83
83
  {
84
84
  test: line => line.startsWith('```'),
@@ -114,14 +114,14 @@ function scanTokens(text) {
114
114
  });
115
115
  codeBlock = null;
116
116
  }
117
- return true; // handled
117
+ return true;
118
118
  }
119
119
  },
120
120
  {
121
121
  test: () => codeBlock,
122
122
  process: line => {
123
123
  codeBlock.content.push(line);
124
- return true; // handled
124
+ return true;
125
125
  }
126
126
  },
127
127
  {
@@ -129,7 +129,7 @@ function scanTokens(text) {
129
129
  process: (line, lineStart, pos) => {
130
130
  const match = PREFIX_REGEX.exec(line);
131
131
  tokens.push({ type: 'prefix', prefix: match[1], iri: match[2].trim() });
132
- return true; // handled
132
+ return true;
133
133
  }
134
134
  },
135
135
  {
@@ -138,32 +138,18 @@ function scanTokens(text) {
138
138
  const match = HEADING_REGEX.exec(line);
139
139
  const attrs = match[3] || null;
140
140
  const afterHashes = match[1].length;
141
- const wsLength = afterHashes < line.length && line[afterHashes] === ' ' ? 1 :
142
- line.slice(afterHashes).match(/^\s+/)?.[0]?.length || 0;
143
- const valueStartInLine = afterHashes + wsLength;
144
- const valueEndInLine = valueStartInLine + match[2].length;
141
+ const rangeInfo = calcRangeInfo(line, attrs, lineStart, afterHashes, match[2].length);
145
142
  tokens.push(createToken('heading', [lineStart, pos - 1], match[2].trim(), attrs,
146
- calcAttrsRange(line, attrs, lineStart),
147
- calcValueRange(lineStart, valueStartInLine, valueEndInLine),
148
- { depth: match[1].length }));
149
- return true; // handled
143
+ rangeInfo.attrsRange, rangeInfo.valueRange, { depth: match[1].length }));
144
+ return true;
150
145
  }
151
146
  },
152
147
  {
153
- test: line => LIST_REGEX.test(line),
148
+ test: line => UNORDERED_LIST_REGEX.test(line),
154
149
  process: (line, lineStart, pos) => {
155
- const match = LIST_REGEX.exec(line);
156
- const attrs = match[4] || null;
157
- const prefix = match[1].length + match[2].length;
158
- const wsLength = prefix < line.length && line[prefix] === ' ' ? 1 :
159
- line.slice(prefix).match(/^\s+/)?.[0]?.length || 0;
160
- const valueStartInLine = prefix + wsLength;
161
- const valueEndInLine = valueStartInLine + match[3].length;
162
- tokens.push(createToken('list', [lineStart, pos - 1], match[3].trim(), attrs,
163
- calcAttrsRange(line, attrs, lineStart),
164
- calcValueRange(lineStart, valueStartInLine, valueEndInLine),
165
- { indent: match[1].length }));
166
- return true; // handled
150
+ const match = UNORDERED_LIST_REGEX.exec(line);
151
+ tokens.push(createListToken('list', line, lineStart, pos, match, match[1].length));
152
+ return true;
167
153
  }
168
154
  },
169
155
  {
@@ -175,15 +161,15 @@ function scanTokens(text) {
175
161
  const valueEndInLine = valueStartInLine + match[1].length;
176
162
  tokens.push(createToken('blockquote', [lineStart, pos - 1], match[1].trim(), attrs,
177
163
  calcAttrsRange(line, attrs, lineStart),
178
- calcValueRange(lineStart, valueStartInLine, valueEndInLine)));
179
- return true; // handled
164
+ [lineStart + valueStartInLine, lineStart + valueEndInLine]));
165
+ return true;
180
166
  }
181
167
  },
182
168
  {
183
169
  test: line => line.trim(),
184
170
  process: (line, lineStart, pos) => {
185
171
  tokens.push(createToken('para', [lineStart, pos - 1], line.trim()));
186
- return true; // handled
172
+ return true;
187
173
  }
188
174
  }
189
175
  ];
@@ -196,7 +182,7 @@ function scanTokens(text) {
196
182
  // Try each processor until one handles the line
197
183
  for (const processor of processors) {
198
184
  if (processor.test(line) && processor.process(line, lineStart, pos)) {
199
- break; // line handled, move to next line
185
+ break;
200
186
  }
201
187
  }
202
188
  }
@@ -212,44 +198,35 @@ function extractInlineCarriers(text, baseOffset = 0) {
212
198
  const carriers = [];
213
199
  let pos = 0;
214
200
 
215
- // Unified carrier extractor with pattern-based handlers
216
- const extractCarrier = (text, pos, baseOffset) => {
217
- // Angle-bracket URLs: <URL>{...}
218
- if (text[pos] === '<') {
201
+ const CARRIER_EXTRACTORS = {
202
+ '<': (text, pos, baseOffset) => {
219
203
  const angleEnd = text.indexOf('>', pos);
220
- if (angleEnd !== -1) {
221
- const url = text.slice(pos + 1, angleEnd);
222
- if (URL_REGEX.test(url)) {
223
- const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, angleEnd + 1, baseOffset);
224
- return createCarrier('link', url, attrs, attrsRange,
225
- [baseOffset + pos + 1, baseOffset + angleEnd],
226
- [baseOffset + pos, baseOffset + finalSpanEnd],
227
- finalSpanEnd, { url });
228
- }
229
- }
230
- return null;
231
- }
232
-
233
- // Bracketed links: [text](URL){...} and [text]{...}
234
- if (text[pos] === '[') {
204
+ if (angleEnd === -1) return null;
205
+ const url = text.slice(pos + 1, angleEnd);
206
+ if (!URL_REGEX.test(url)) return null;
207
+ const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, angleEnd + 1, baseOffset);
208
+ return createCarrier('link', url, attrs, attrsRange,
209
+ [baseOffset + pos + 1, baseOffset + angleEnd],
210
+ [baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url });
211
+ },
212
+ '[': (text, pos, baseOffset) => {
235
213
  const bracketEnd = findMatchingBracket(text, pos);
236
- if (bracketEnd) {
237
- const carrierText = text.slice(pos + 1, bracketEnd - 1);
238
- const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
239
- const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
240
- const { carrierType, resourceIRI } = determineCarrierType(url);
241
-
242
- if (url?.startsWith('=')) return { skip: true, pos: finalSpanEnd };
243
-
244
- return createCarrier(carrierType, carrierText, attrs, attrsRange,
245
- [baseOffset + pos + 1, baseOffset + bracketEnd - 1],
246
- [baseOffset + pos, baseOffset + finalSpanEnd],
247
- finalSpanEnd, { url: resourceIRI });
248
- }
249
- return null;
214
+ if (!bracketEnd) return null;
215
+ const carrierText = text.slice(pos + 1, bracketEnd - 1);
216
+ const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
217
+ const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
218
+ const { carrierType, resourceIRI } = determineCarrierType(url);
219
+ if (url?.startsWith('=')) return { skip: true, pos: finalSpanEnd };
220
+ return createCarrier(carrierType, carrierText, attrs, attrsRange,
221
+ [baseOffset + pos + 1, baseOffset + bracketEnd - 1],
222
+ [baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url: resourceIRI });
250
223
  }
224
+ };
225
+
226
+ const extractCarrier = (text, pos, baseOffset) => {
227
+ const extractor = CARRIER_EXTRACTORS[text[pos]];
228
+ if (extractor) return extractor(text, pos, baseOffset);
251
229
 
252
- // Regex-based carriers: emphasis and code spans
253
230
  for (const [type, pattern] of Object.entries(INLINE_CARRIER_PATTERNS)) {
254
231
  pattern.lastIndex = pos;
255
232
  const match = pattern.exec(text);
@@ -260,7 +237,6 @@ function extractInlineCarriers(text, baseOffset = 0) {
260
237
  ranges.attrsRange, ranges.valueRange, ranges.range, ranges.pos);
261
238
  }
262
239
  }
263
-
264
240
  return null;
265
241
  };
266
242
 
@@ -357,14 +333,9 @@ function createBlock(subject, types, predicates, entries, range, attrsRange, val
357
333
  predicates: predicates.map(p => ({ iri: expandIRI(p.iri, ctx), form: p.form }))
358
334
  };
359
335
 
360
- const signature = [
361
- subject,
362
- carrierType || 'unknown',
363
- expanded.types.join(','),
364
- expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')
365
- ].join('|');
366
-
336
+ const signature = [subject, carrierType || 'unknown', expanded.types.join(','), expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')].join('|');
367
337
  const blockId = hash(signature);
338
+
368
339
  return {
369
340
  id: blockId,
370
341
  range: { start: range[0], end: range[1] },
@@ -391,93 +362,75 @@ function emitQuad(quads, quadIndex, blockId, subject, predicate, object, dataFac
391
362
 
392
363
  quadIndex.set(quadIndexKey(quad.subject, quad.predicate, quad.object), slotInfo);
393
364
  }
365
+ const resolveFragment = (fragment, state) => {
366
+ if (!state.currentSubject) return null;
367
+ const baseIRI = state.currentSubject.value.split('#')[0];
368
+ return state.df.namedNode(`${baseIRI}#${fragment}`);
369
+ };
370
+
394
371
  function resolveSubject(sem, state) {
395
372
  if (!sem.subject) return null;
396
373
  if (sem.subject === 'RESET') {
397
374
  state.currentSubject = null;
398
375
  return null;
399
376
  }
400
- if (sem.subject.startsWith('=#')) {
401
- const fragment = sem.subject.substring(2);
402
- if (state.currentSubject) {
403
- const baseIRI = state.currentSubject.value.split('#')[0];
404
- return state.df.namedNode(`${baseIRI}#${fragment}`);
405
- }
406
- return null;
407
- }
377
+ if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
408
378
  return state.df.namedNode(expandIRI(sem.subject, state.ctx));
409
379
  }
410
380
 
411
381
  function resolveObject(sem, state) {
412
382
  if (!sem.object) return null;
413
- if (sem.object.startsWith('#')) {
414
- const fragment = sem.object.substring(1);
415
- if (state.currentSubject) {
416
- const baseIRI = state.currentSubject.value.split('#')[0];
417
- return state.df.namedNode(`${baseIRI}#${fragment}`);
418
- }
419
- return null;
420
- }
383
+ if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
421
384
  return state.df.namedNode(expandIRI(sem.object, state.ctx));
422
385
  }
423
386
 
387
+ const createTypeQuad = (typeIRI, subject, state, blockId, entryIndex = null) => {
388
+ const expandedType = expandIRI(typeIRI, state.ctx);
389
+ emitQuad(
390
+ state.quads, state.origin.quadIndex, blockId,
391
+ subject,
392
+ state.df.namedNode(expandIRI('rdf:type', state.ctx)),
393
+ state.df.namedNode(expandedType),
394
+ state.df,
395
+ { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
396
+ );
397
+ };
398
+
424
399
  function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier) {
425
400
  sem.types.forEach(t => {
426
401
  const typeIRI = typeof t === 'string' ? t : t.iri;
427
402
  const entryIndex = typeof t === 'string' ? null : t.entryIndex;
428
-
429
- // For angle-bracket URLs and bracketed links [text](URL), use the URL as the subject
430
- // for type declarations when there's no explicit subject declaration.
431
- // This implements {+URL} soft subject behavior.
432
403
  let typeSubject = newSubject ? newSubject : (localObject || carrierO || S);
433
404
  if (carrier?.type === 'link' && carrier?.url && !newSubject) {
434
- typeSubject = carrierO; // Use URL as subject for type declarations
405
+ typeSubject = carrierO;
435
406
  }
436
-
437
- const expandedType = expandIRI(typeIRI, state.ctx);
438
-
439
- emitQuad(
440
- state.quads, state.origin.quadIndex, block.id,
441
- typeSubject,
442
- state.df.namedNode(expandIRI('rdf:type', state.ctx)),
443
- state.df.namedNode(expandedType),
444
- state.df,
445
- { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
446
- );
407
+ createTypeQuad(typeIRI, typeSubject, state, block.id, entryIndex);
447
408
  });
448
409
  }
449
410
 
411
+ const determinePredicateRole = (pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L) => {
412
+ if (pred.form === '' && carrier?.type === 'link' && carrier?.url && carrier.text === carrier.url) {
413
+ return null;
414
+ }
415
+ switch (pred.form) {
416
+ case '':
417
+ return carrier?.type === 'link' && carrier?.url && carrier.text !== carrier.url && !newSubject
418
+ ? { subject: newSubjectOrCarrierO, object: L }
419
+ : { subject: localObject || S, object: L };
420
+ case '?':
421
+ return { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
422
+ case '!':
423
+ return { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
424
+ default:
425
+ return null;
426
+ }
427
+ };
428
+
450
429
  function processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier) {
451
430
  sem.predicates.forEach(pred => {
452
- const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
453
-
454
- // Skip literal predicates for angle-bracket URLs only
455
- if (pred.form === '' && carrier?.type === 'link' && carrier?.url && carrier.text === carrier.url) {
456
- return;
457
- }
458
-
459
- // Determine subject/object roles based on predicate form
460
- let role;
461
- switch (pred.form) {
462
- case '':
463
- // For bracketed links with literal predicates and no explicit subject, use URL as subject
464
- if (carrier?.type === 'link' && carrier?.url && carrier.text !== carrier.url && !newSubject) {
465
- role = { subject: newSubjectOrCarrierO, object: L };
466
- } else {
467
- role = { subject: localObject || S, object: L };
468
- }
469
- break;
470
- case '?':
471
- role = { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
472
- break;
473
- case '!':
474
- role = { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
475
- break;
476
- default:
477
- role = null;
478
- }
479
-
431
+ const role = determinePredicateRole(pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L);
480
432
  if (role) {
433
+ const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
481
434
  emitQuad(state.quads, state.origin.quadIndex, block.id,
482
435
  role.subject, P, role.object, state.df,
483
436
  { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
@@ -498,7 +451,6 @@ function processAnnotation(carrier, sem, state, options = {}) {
498
451
  const newSubject = resolveSubject(sem, state);
499
452
  const localObject = resolveObject(sem, state);
500
453
 
501
- // Use implicit subject if provided (for list items)
502
454
  const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
503
455
  if (newSubject && !preserveGlobalSubject && !implicitSubject) {
504
456
  state.currentSubject = newSubject;
@@ -546,23 +498,7 @@ export function findItemSubject(listToken, carriers, state) {
546
498
  return null;
547
499
  }
548
500
 
549
- function hasOwnPredicates(listToken, carriers) {
550
- // Check for explicit predicates (excluding subject declarations)
551
- if (listToken.attrs) {
552
- const attrs = parseSemCached(listToken.attrs);
553
- if (attrs.predicates.some(p => !p.subject && p.iri !== 'RESET')) {
554
- return true;
555
- }
556
- }
557
- return carriers.some(carrier => {
558
- const carrierAttrs = parseSemCached(carrier.attrs);
559
- return carrierAttrs.predicates.some(p => !p.subject && p.iri !== 'RESET');
560
- });
561
- }
562
-
563
- // Unified list context processing
564
- function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) {
565
- // Emit types
501
+ const processContextSem = ({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) => {
566
502
  sem.types.forEach(t => {
567
503
  const typeIRI = typeof t === 'string' ? t : t.iri;
568
504
  emitQuad(
@@ -574,7 +510,6 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
574
510
  );
575
511
  });
576
512
 
577
- // Emit directional predicates
578
513
  sem.predicates.forEach(pred => {
579
514
  const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
580
515
  if (pred.form === '!') {
@@ -584,46 +519,35 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
584
519
  }
585
520
  });
586
521
 
587
- // Optionally inherit literal predicates
588
522
  if (inheritLiterals) {
589
523
  const literalPredicates = sem.predicates.filter(p => p.form === '');
590
524
  if (literalPredicates.length > 0) {
591
525
  return {
592
- subject: null,
593
- object: null,
594
- types: [],
526
+ subject: null, object: null, types: [],
595
527
  predicates: literalPredicates.map(p => ({ iri: p.iri, form: p.form, entryIndex: p.entryIndex })),
596
- datatype: null,
597
- language: null,
598
- entries: []
528
+ datatype: null, language: null, entries: []
599
529
  };
600
530
  }
601
531
  }
602
532
  return null;
603
- }
533
+ };
604
534
 
605
- // List stack management functions
606
- function manageListStack(token, state) {
607
- // Pop stack frames for lists that have ended (strictly less indent)
608
- while (
609
- state.listStack.length &&
610
- token.indent < state.listStack[state.listStack.length - 1].indent
611
- ) {
535
+ const manageListStack = (token, state) => {
536
+ while (state.listStack.length && token.indent < state.listStack[state.listStack.length - 1].indent) {
612
537
  state.listStack.pop();
613
538
  }
614
539
 
615
- // If we have pending context, always create a new frame for it
616
540
  if (state.pendingListContext) {
617
541
  state.listStack.push({
618
542
  indent: token.indent,
619
543
  anchorSubject: state.pendingListContext.subject,
620
544
  contextSubject: state.pendingListContext.subject,
621
- contextSem: state.pendingListContext.sem
545
+ contextSem: state.pendingListContext.sem,
546
+ contextText: state.pendingListContext.contextText,
547
+ contextToken: state.pendingListContext.contextToken // Store context token for origins
622
548
  });
623
549
  state.pendingListContext = null;
624
550
  } else if (state.listStack.length === 0 || token.indent > state.listStack[state.listStack.length - 1].indent) {
625
- // Push empty frame for nested lists without explicit context
626
- // Inherit anchorSubject from parent frame if available
627
551
  const parentFrame = state.listStack.length > 0 ? state.listStack[state.listStack.length - 1] : null;
628
552
  state.listStack.push({
629
553
  indent: token.indent,
@@ -632,138 +556,94 @@ function manageListStack(token, state) {
632
556
  contextSem: null
633
557
  });
634
558
  }
635
- // If token.indent == current frame indent and no pending context, we're at same level - do nothing
636
- }
637
-
638
- function processListItem(token, state) {
639
- const carriers = getCarriers(token);
640
-
641
- // Find item subject from list token or inline carriers
642
- const itemInfo = findItemSubject(token, carriers, state);
643
- if (!itemInfo) return;
559
+ };
644
560
 
645
- const { subject: itemSubject } = itemInfo;
561
+ const combineSemanticInfo = (token, carriers, listFrame, state, itemSubject) => {
562
+ const combinedSem = { subject: null, object: null, types: [], predicates: [], datatype: null, language: null, entries: [] };
563
+ const addSem = (sem) => { combinedSem.types.push(...sem.types); combinedSem.predicates.push(...sem.predicates); combinedSem.entries.push(...sem.entries); };
646
564
 
647
- // Update the current list frame to track this item's subject for nested contexts
648
- if (state.listStack.length > 0) {
649
- const currentFrame = state.listStack[state.listStack.length - 1];
650
- currentFrame.anchorSubject = itemSubject;
565
+ if (listFrame?.contextSem) {
566
+ const inheritedSem = processContextSem({ sem: listFrame.contextSem, itemSubject, contextSubject: listFrame.contextSubject, inheritLiterals: true, state });
567
+ if (inheritedSem) addSem(inheritedSem);
651
568
  }
652
569
 
653
- const listFrame = state.listStack[state.listStack.length - 1];
654
-
655
- // Collect all semantic information for this list item
656
- let combinedSem = {
657
- subject: null,
658
- object: null,
659
- types: [],
660
- predicates: [],
661
- datatype: null,
662
- language: null,
663
- entries: []
664
- };
570
+ if (token.attrs) addSem(parseSemCached(token.attrs));
571
+ carriers.forEach(carrier => { if (carrier.attrs) addSem(parseSemCached(carrier.attrs)); });
665
572
 
666
- // Apply list context if available - inherit everything
667
- if (listFrame?.contextSem) {
668
- const inheritedSem = processContextSem({
669
- sem: listFrame.contextSem,
670
- itemSubject,
671
- contextSubject: listFrame.contextSubject,
672
- inheritLiterals: true,
673
- state
674
- });
573
+ return combinedSem;
574
+ };
675
575
 
676
- if (inheritedSem) {
677
- combinedSem.types.push(...inheritedSem.types);
678
- combinedSem.predicates.push(...inheritedSem.predicates);
679
- combinedSem.entries.push(...inheritedSem.entries);
680
- }
681
- }
576
+ const processListItem = (token, state) => {
577
+ const carriers = getCarriers(token);
578
+ const itemInfo = findItemSubject(token, carriers, state);
579
+ if (!itemInfo) return;
682
580
 
683
- // Add item's own annotations
684
- if (token.attrs) {
685
- const sem = parseSemCached(token.attrs);
686
- combinedSem.types.push(...sem.types);
687
- combinedSem.predicates.push(...sem.predicates);
688
- combinedSem.entries.push(...sem.entries);
689
- }
581
+ const { subject: itemSubject } = itemInfo;
582
+ if (state.listStack.length > 0) state.listStack[state.listStack.length - 1].anchorSubject = itemSubject;
690
583
 
691
- // Add inline carriers' annotations
692
- carriers.forEach(carrier => {
693
- if (carrier.attrs) {
694
- const sem = parseSemCached(carrier.attrs);
695
- combinedSem.types.push(...sem.types);
696
- combinedSem.predicates.push(...sem.predicates);
697
- combinedSem.entries.push(...sem.entries);
698
- }
699
- });
584
+ const listFrame = state.listStack[state.listStack.length - 1];
585
+ const combinedSem = combineSemanticInfo(token, carriers, listFrame, state, itemSubject);
700
586
 
701
- // Only create a block if we have semantic information
702
587
  if (combinedSem.entries.length > 0) {
703
588
  const prevSubject = state.currentSubject;
704
589
  state.currentSubject = itemSubject;
705
590
 
706
- processAnnotation({
707
- type: 'list',
708
- text: token.text,
709
- range: token.range,
710
- attrsRange: token.attrsRange || null,
711
- valueRange: token.valueRange || null
712
- }, combinedSem, state, {
713
- preserveGlobalSubject: !state.listStack.length,
714
- implicitSubject: itemSubject
715
- });
591
+ processAnnotation({ type: 'list', text: token.text, range: token.range, attrsRange: token.attrsRange || null, valueRange: token.valueRange || null }, combinedSem, state, { preserveGlobalSubject: !state.listStack.length, implicitSubject: itemSubject });
716
592
 
717
593
  state.currentSubject = prevSubject;
718
594
  }
719
- }
595
+ };
596
+
720
597
 
721
598
  function processListContextFromParagraph(token, state) {
722
599
  const contextMatch = LIST_CONTEXT_REGEX.exec(token.text);
723
-
724
- if (contextMatch) {
725
- const contextSem = parseSemCached(`{${contextMatch[2]}}`);
726
-
727
- // Context subject resolution:
728
- // 1. For top-level lists: use current subject or document subject
729
- // 2. For nested lists: use parent list item's subject
730
- let contextSubject = state.currentSubject || state.documentSubject;
731
-
732
- // Check if this is a nested list context by looking ahead
733
- const nextTokenIndex = state.currentTokenIndex + 1;
734
- const nextToken = state.tokens && state.tokens[nextTokenIndex];
735
-
736
- if (state.listStack.length > 0 && nextToken && nextToken.type === 'list') {
737
- const currentFrame = state.listStack[state.listStack.length - 1];
738
- if (currentFrame.anchorSubject && nextToken.indent > currentFrame.indent) {
739
- contextSubject = currentFrame.anchorSubject;
600
+ if (!contextMatch) return;
601
+
602
+ const contextSem = parseSemCached(`{${contextMatch[2]}}`);
603
+ let contextSubject = state.currentSubject || state.documentSubject;
604
+
605
+ if (!contextSubject && state.tokens) {
606
+ for (let i = state.currentTokenIndex - 1; i >= 0; i--) {
607
+ const prevToken = state.tokens[i];
608
+ if (prevToken.type === 'heading' && prevToken.attrs) {
609
+ const prevSem = parseSemCached(prevToken.attrs);
610
+ if (prevSem.subject) {
611
+ const resolvedSubject = resolveSubject(prevSem, state);
612
+ if (resolvedSubject) {
613
+ contextSubject = resolvedSubject.value;
614
+ break;
615
+ }
616
+ }
740
617
  }
741
618
  }
619
+ }
742
620
 
743
- state.pendingListContext = {
744
- sem: contextSem,
745
- subject: contextSubject
746
- };
621
+ const nextToken = state.tokens?.[state.currentTokenIndex + 1];
622
+ if (state.listStack.length > 0 && nextToken && nextToken.type === 'list') {
623
+ const currentFrame = state.listStack[state.listStack.length - 1];
624
+ if (currentFrame.anchorSubject && nextToken.indent > currentFrame.indent) {
625
+ contextSubject = currentFrame.anchorSubject;
626
+ }
747
627
  }
628
+
629
+ state.pendingListContext = {
630
+ sem: contextSem,
631
+ subject: contextSubject,
632
+ contextText: contextMatch[1].replace(':', '').trim(),
633
+ contextToken: token // Store the context token for origin ranges
634
+ };
748
635
  }
749
636
 
750
- // Helper functions for token processing
751
637
  function processTokenAnnotations(token, state, tokenType) {
752
- // Process token's own attributes
753
638
  if (token.attrs) {
754
639
  const sem = parseSemCached(token.attrs);
755
640
  processAnnotation({
756
- type: tokenType,
757
- text: token.text,
758
- range: token.range,
759
- attrsRange: token.attrsRange || null,
760
- valueRange: token.valueRange || null
641
+ type: tokenType, text: token.text, range: token.range,
642
+ attrsRange: token.attrsRange || null, valueRange: token.valueRange || null
761
643
  }, sem, state);
762
644
  }
763
645
 
764
- // Process inline carriers
765
- const carriers = getCarriers(token);
766
- carriers.forEach(carrier => {
646
+ getCarriers(token).forEach(carrier => {
767
647
  if (carrier.attrs) {
768
648
  const sem = parseSemCached(carrier.attrs);
769
649
  processAnnotation(carrier, sem, state);
@@ -777,17 +657,41 @@ function processStandaloneSubject(token, state) {
777
657
 
778
658
  const sem = parseSemCached(`{=${match[1]}}`);
779
659
  const attrsStart = token.range[0] + token.text.indexOf('{=');
780
- const attrsEnd = attrsStart + (match[1] ? match[1].length : 0);
781
-
782
660
  processAnnotation({
783
- type: 'standalone',
784
- text: '',
785
- range: token.range,
786
- attrsRange: [attrsStart, attrsEnd],
661
+ type: 'standalone', text: '', range: token.range,
662
+ attrsRange: [attrsStart, attrsStart + (match[1] ? match[1].length : 0)],
787
663
  valueRange: null
788
664
  }, sem, state);
789
665
  }
790
666
 
667
+ const TOKEN_PROCESSORS = {
668
+ heading: (token, state) => {
669
+ if (token.attrs) {
670
+ const headingSem = parseSemCached(token.attrs);
671
+ if (headingSem.subject) {
672
+ const subject = resolveSubject(headingSem, state);
673
+ if (subject) state.documentSubject = subject;
674
+ }
675
+ }
676
+ processTokenAnnotations(token, state, token.type);
677
+ },
678
+ code: (token, state) => {
679
+ processTokenAnnotations(token, state, token.type);
680
+ },
681
+ blockquote: (token, state) => {
682
+ processTokenAnnotations(token, state, token.type);
683
+ },
684
+ para: (token, state) => {
685
+ processStandaloneSubject(token, state);
686
+ processListContextFromParagraph(token, state);
687
+ processTokenAnnotations(token, state, token.type);
688
+ },
689
+ list: (token, state) => {
690
+ manageListStack(token, state);
691
+ processListItem(token, state);
692
+ },
693
+ };
694
+
791
695
  export function parse(text, options = {}) {
792
696
  const state = {
793
697
  ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
@@ -795,69 +699,32 @@ export function parse(text, options = {}) {
795
699
  quads: [],
796
700
  origin: { blocks: new Map(), quadIndex: new Map() },
797
701
  currentSubject: null,
798
- documentSubject: null, // Track main document subject from headings
702
+ documentSubject: null,
799
703
  listStack: [],
800
704
  pendingListContext: null,
801
- tokens: null, // Store tokens for lookahead
802
- currentTokenIndex: -1 // Track current token index
705
+ tokens: null,
706
+ currentTokenIndex: -1
803
707
  };
804
708
 
805
709
  state.tokens = scanTokens(text);
806
710
 
807
- // Process prefix declarations first with prefix folding support
808
711
  state.tokens.filter(t => t.type === 'prefix').forEach(t => {
809
- // Check if the IRI value contains a CURIE that references a previously defined prefix
810
712
  let resolvedIri = t.iri;
811
713
  if (t.iri.includes(':')) {
812
- const [potentialPrefix, ...referenceParts] = t.iri.split(':');
813
- const reference = referenceParts.join(':'); // Preserve any additional colons in reference
714
+ const colonIndex = t.iri.indexOf(':');
715
+ const potentialPrefix = t.iri.substring(0, colonIndex);
716
+ const reference = t.iri.substring(colonIndex + 1);
814
717
  if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
815
- // This is a CURIE referencing an existing prefix - resolve it
816
718
  resolvedIri = state.ctx[potentialPrefix] + reference;
817
719
  }
818
720
  }
819
721
  state.ctx[t.prefix] = resolvedIri;
820
722
  });
821
723
 
822
- // Process all other tokens
823
724
  for (let i = 0; i < state.tokens.length; i++) {
824
725
  const token = state.tokens[i];
825
726
  state.currentTokenIndex = i;
826
-
827
- switch (token.type) {
828
- case 'heading':
829
- // Update document subject when processing headings
830
- if (token.attrs) {
831
- const headingSem = parseSemCached(token.attrs);
832
- if (headingSem.subject) {
833
- const subject = resolveSubject(headingSem, state);
834
- if (subject) {
835
- state.documentSubject = subject;
836
- }
837
- }
838
- }
839
- processTokenAnnotations(token, state, token.type);
840
- break;
841
- case 'code':
842
- // Process annotations on the opening fence, but skip content processing
843
- // This allows safe self-explaining of the format in documentation
844
- processTokenAnnotations(token, state, token.type);
845
- break;
846
- case 'blockquote':
847
- processTokenAnnotations(token, state, token.type);
848
- break;
849
-
850
- case 'para':
851
- processStandaloneSubject(token, state);
852
- processListContextFromParagraph(token, state);
853
- processTokenAnnotations(token, state, token.type);
854
- break;
855
-
856
- case 'list':
857
- manageListStack(token, state);
858
- processListItem(token, state);
859
- break;
860
- }
727
+ TOKEN_PROCESSORS[token.type]?.(token, state);
861
728
  }
862
729
 
863
730
  return { quads: state.quads, origin: state.origin, context: state.ctx };
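
The prefix-folding step in the refactored `parse()` above is easier to see in isolation: a prefix declaration whose IRI is itself a CURIE over an already-declared prefix is expanded by splitting at the first colon only, so any later colons in the reference survive. Below is a standalone sketch of that rule; the context entries are made-up examples.

```js
// Standalone sketch of the prefix-folding rule applied to prefix declarations.
// ctx maps declared prefixes to namespace IRIs; the values below are examples.
function foldPrefix(iri, ctx) {
  const colonIndex = iri.indexOf(':');
  if (colonIndex === -1) return iri;
  const potentialPrefix = iri.substring(0, colonIndex);
  const reference = iri.substring(colonIndex + 1);
  // Only fold when the left-hand side names a known prefix (never @vocab).
  return ctx[potentialPrefix] && potentialPrefix !== '@vocab'
    ? ctx[potentialPrefix] + reference
    : iri;
}

const ctx = { schema: 'https://schema.org/' };
console.log(foldPrefix('schema:Person', ctx));           // -> https://schema.org/Person
console.log(foldPrefix('https://example.org/ns#', ctx)); // unchanged: "https" is not a declared prefix
```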
package/src/serialize.js CHANGED
@@ -177,6 +177,7 @@ export function serialize({ text, diff, origin, options = {} }) {
177
177
  return applyEdits(text, edits, ctx, base);
178
178
  }
179
179
 
180
+
180
181
  function planOperations(diff, base, ctx) {
181
182
  // Normalize quads once
182
183
  const normAdds = (diff.add || []).map(normalizeQuad).filter(isValidQuad);
@@ -376,6 +377,11 @@ function materializeEdits(plan, text, ctx, base) {
376
377
 
377
378
  // Materialize adds
378
379
  for (const { quad, targetBlock } of plan.adds) {
380
+ const quadKey = quadToKeyForOrigin(quad);
381
+ if (plan.consumedAdds.has(quadKey)) {
382
+ continue;
383
+ }
384
+
379
385
  if (quad.object.termType === 'Literal' || quad.object.termType === 'NamedNode') {
380
386
  if (!targetBlock) {
381
387
  const predShort = shortenIRI(quad.predicate.value, ctx);
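
The guard added at the top of the add-materialization loop skips quads whose key was already consumed by an earlier planned operation, so the same triple is not written into the document twice. A minimal sketch of that pattern follows; `quadToKeyForOrigin()` is not shown in this diff, so the key function below is a stand-in.

```js
// Sketch of the dedup guard added to materializeEdits(); the real key comes
// from quadToKeyForOrigin(), which this diff does not show.
const keyFor = (quad) =>
  `${quad.subject.value}|${quad.predicate.value}|${quad.object.value}`;

function pendingAdds(plan) {
  const out = [];
  for (const { quad, targetBlock } of plan.adds) {
    if (plan.consumedAdds.has(keyFor(quad))) continue; // already handled by another edit
    out.push({ quad, targetBlock });
  }
  return out;
}
```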