mdld-parse 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.js ADDED
@@ -0,0 +1,788 @@
1
+ import {
2
+ DEFAULT_CONTEXT,
3
+ DataFactory,
4
+ expandIRI,
5
+ parseSemanticBlock,
6
+ quadIndexKey,
7
+ createSlotInfo,
8
+ createLiteral,
9
+ hash
10
+ } from './utils.js';
11
+
12
// Semantic block cache to avoid repeated parsing.
// A Map is used rather than a plain object so an annotation string can never
// be mistaken for a name inherited from Object.prototype (e.g. "toString").
const semCache = new Map();
const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });

/**
 * Parse an annotation string once and hand back the cached result afterwards.
 *
 * @param {string|null|undefined} attrs - Raw annotation text.
 * @returns {object} EMPTY_SEM for empty input; otherwise the (shallowly)
 *   frozen value returned by parseSemanticBlock for that exact string.
 */
function parseSemCached(attrs) {
    if (!attrs) return EMPTY_SEM;
    let sem = semCache.get(attrs);
    if (!sem) {
        sem = Object.freeze(parseSemanticBlock(attrs));
        semCache.set(attrs, sem);
    }
    return sem;
}
25
+
26
/**
 * Convert the position of an annotation inside its line to document offsets.
 *
 * @param {string} line - Full text of the line.
 * @param {string|null} attrs - Annotation text to locate, braces included.
 * @param {number} lineStart - Offset of the line's first character.
 * @returns {[number, number]|null} [start, end) offsets, or null when there
 *   is no annotation or it cannot be found in the line.
 */
function calcAttrsRange(line, attrs, lineStart) {
    if (!attrs) {
        return null;
    }
    // Search from the end so the last occurrence in the line wins.
    const offsetInLine = line.lastIndexOf(attrs);
    if (offsetInLine < 0) {
        return null;
    }
    const begin = lineStart + offsetInLine;
    return [begin, begin + attrs.length];
}
31
+
32
/**
 * Shift a line-relative value span by the offset of its line.
 *
 * @param {number} lineStart - Offset of the line's first character.
 * @param {number} valueStartInLine - Start of the value within the line.
 * @param {number} valueEndInLine - End of the value within the line.
 * @returns {[number, number]} The shifted [start, end] pair.
 */
function calcValueRange(lineStart, valueStartInLine, valueEndInLine) {
    const begin = lineStart + valueStartInLine;
    const end = lineStart + valueEndInLine;
    return [begin, end];
}
35
+
36
/**
 * Build a block-level token.
 *
 * Besides the listed fields the token gets a hidden, writable `_carriers`
 * slot (initially null) that is left out of enumeration and spreads.
 *
 * @param {string} type
 * @param {[number, number]} range
 * @param {string} text
 * @param {string|null} [attrs]
 * @param {[number, number]|null} [attrsRange]
 * @param {[number, number]|null} [valueRange]
 * @param {object} [extra] - Additional own properties copied onto the token.
 * @returns {object} The new token.
 */
function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
    const carrierSlot = { value: null, writable: true, enumerable: false };
    return Object.defineProperty(
        { type, range, text, attrs, attrsRange, valueRange, ...extra },
        '_carriers',
        carrierSlot
    );
}
46
+
47
/**
 * Return the inline carriers of a token, extracting them on first use and
 * memoising the result on `token._carriers`.
 *
 * `token.text` holds the token's value text, which for headings, list items
 * and blockquotes starts after the block marker. The extraction is therefore
 * anchored at the start of the value range when the token has one, and at the
 * start of the token otherwise, so carrier offsets line up with the text.
 *
 * @param {object} token - Token with `text`, `range` and optional `valueRange`.
 * @returns {Array<object>} The carriers found in the token text.
 */
function getCarriers(token) {
    if (!token._carriers) {
        const baseOffset = token.valueRange ? token.valueRange[0] : token.range[0];
        token._carriers = extractInlineCarriers(token.text, baseOffset);
    }
    return token._carriers;
}
53
+
54
/**
 * Split a document into line-level tokens.
 *
 * Lines are classified in this order: fenced code, prefix declaration
 * (`[name] <iri>`), heading, list item, blockquote, and finally paragraph for
 * any other non-blank line. Offsets are computed assuming lines are separated
 * by a single "\n".
 *
 * While a fenced code block is open, every line that does not start with the
 * opening fence is kept as content. This includes lines that begin with a
 * shorter run of backticks, which were previously dropped. A block that is
 * never closed produces no token.
 *
 * @param {string} text - Full document text.
 * @returns {Array<object>} Tokens in document order.
 */
function scanTokens(text) {
    const tokens = [];
    let pos = 0;
    let codeBlock = null;

    for (const line of text.split('\n')) {
        const lineStart = pos;
        pos += line.length + 1;

        if (codeBlock) {
            if (line.startsWith(codeBlock.fence)) {
                const valueStart = codeBlock.valueRangeStart;
                // Content ends just before the newline that precedes the closing fence.
                const valueEnd = Math.max(valueStart, lineStart - 1);
                tokens.push({
                    type: 'code',
                    range: [codeBlock.start, lineStart],
                    text: codeBlock.content.join('\n'),
                    lang: codeBlock.lang,
                    attrs: codeBlock.attrs,
                    attrsRange: codeBlock.attrsRange,
                    valueRange: [valueStart, valueEnd]
                });
                codeBlock = null;
            } else {
                codeBlock.content.push(line);
            }
            continue;
        }

        if (line.startsWith('```')) {
            const fence = line.match(/^(`{3,})(.*)/);
            const attrsText = fence[2].match(/\{[^{}]*\}/)?.[0] || null;
            const attrsStartInLine = attrsText ? line.indexOf(attrsText) : -1;
            const contentStart = lineStart + line.length + 1;
            codeBlock = {
                fence: fence[1],
                start: lineStart,
                content: [],
                lang: fence[2].trim().split(/[\s{]/)[0],
                attrs: attrsText,
                attrsRange: attrsText && attrsStartInLine >= 0
                    ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrsText.length]
                    : null,
                valueRangeStart: contentStart
            };
            continue;
        }

        const prefixMatch = line.match(/^\[([^\]]+)\]\s*<([^>]+)>/);
        if (prefixMatch) {
            tokens.push({ type: 'prefix', prefix: prefixMatch[1], iri: prefixMatch[2].trim() });
            continue;
        }

        const headingMatch = line.match(/^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/);
        if (headingMatch) {
            const attrs = headingMatch[3] || null;
            const afterHashes = headingMatch[1].length;
            const ws = line.substring(afterHashes).match(/^\s+/)?.[0]?.length || 0;
            const valueStartInLine = afterHashes + ws;
            const valueEndInLine = valueStartInLine + headingMatch[2].length;
            tokens.push(createToken('heading', [lineStart, pos - 1], headingMatch[2].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine),
                { depth: headingMatch[1].length }));
            continue;
        }

        const listMatch = line.match(/^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/);
        if (listMatch) {
            const attrs = listMatch[4] || null;
            const prefix = listMatch[1].length + listMatch[2].length;
            const ws = line.substring(prefix).match(/^\s+/)?.[0]?.length || 0;
            const valueStartInLine = prefix + ws;
            const valueEndInLine = valueStartInLine + listMatch[3].length;
            tokens.push(createToken('list', [lineStart, pos - 1], listMatch[3].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine),
                { indent: listMatch[1].length }));
            continue;
        }

        const blockquoteMatch = line.match(/^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/);
        if (blockquoteMatch) {
            const attrs = blockquoteMatch[2] || null;
            const markerMatch = line.match(/^>\s+/);
            const valueStartInLine = markerMatch ? markerMatch[0].length : 2;
            const valueEndInLine = valueStartInLine + blockquoteMatch[1].length;
            tokens.push(createToken('blockquote', [lineStart, pos - 1], blockquoteMatch[1].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine)));
            continue;
        }

        if (line.trim()) {
            tokens.push(createToken('para', [lineStart, pos - 1], line.trim()));
        }
    }

    return tokens;
}
155
+
156
// Inline carrier pattern constants. Both expressions are sticky (`y`), so a
// match is only attempted exactly at `lastIndex`, which callers set before
// every `exec`.
const INLINE_CARRIER_PATTERNS = {
    // A run of `*`, `_` or backtick characters on each side of the value,
    // followed by an annotation block. The duplicated `_` in the original
    // character classes was redundant and has been removed.
    EMPHASIS: /[*_`]+(.+?)[*_`]+\s*\{([^}]+)\}/y,
    CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
};
161
+
162
/**
 * Assemble an inline carrier record.
 *
 * @param {string} type
 * @param {string} text
 * @param {string|null} attrs
 * @param {[number, number]|null} attrsRange
 * @param {[number, number]|null} valueRange
 * @param {[number, number]} range
 * @param {number} pos - Index in the scanned text where scanning resumes.
 * @param {object} [extra] - Extra properties; they override the base fields.
 * @returns {object} The carrier record.
 */
function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
    const base = { type, text, attrs, attrsRange, valueRange, range, pos };
    return { ...base, ...extra };
}
165
+
166
/**
 * Walk through `text` and collect every inline carrier.
 *
 * At each position the emphasis, code-span and bracket extractors are tried
 * in that order. A successful extractor moves the cursor to the `pos` it
 * reports; bracket results flagged `skip` advance the cursor without being
 * collected. When nothing matches, the cursor moves on by one character.
 *
 * @param {string} text - Text to scan.
 * @param {number} [baseOffset=0] - Offset handed through to the extractors.
 * @returns {Array<object>} Carriers in order of appearance.
 */
function extractInlineCarriers(text, baseOffset = 0) {
    const found = [];
    let cursor = 0;

    while (cursor < text.length) {
        const inline = tryExtractEmphasisCarrier(text, cursor, baseOffset)
            || tryExtractCodeCarrier(text, cursor, baseOffset);
        if (inline) {
            found.push(inline);
            cursor = inline.pos;
            continue;
        }

        const bracket = tryExtractBracketCarrier(text, cursor, baseOffset);
        if (!bracket) {
            cursor += 1;
            continue;
        }
        if (!bracket.skip) {
            found.push(bracket);
        }
        cursor = bracket.pos;
    }

    return found;
}
201
+
202
/**
 * Derive the ranges of a delimiter-based carrier (emphasis or code span)
 * from its regex match.
 *
 * `valueRange`, `attrsRange` and `range` all include `baseOffset`. `pos` stays
 * relative to the scanned text because it is used as the next scan index.
 *
 * Fixes over the previous version:
 *  - the value range now starts after the opening delimiter run instead of on it;
 *  - the annotation range and the end of `range` now include `baseOffset`;
 *  - the opening brace is located from the end of the match, so a `{` inside
 *    the value no longer shifts the annotation range.
 *
 * @param {RegExpExecArray} match - [whole, value, annotation body]; the whole
 *   match must end with `{body}`.
 * @param {number} baseOffset - Offset of the scanned text's first character.
 * @param {number} matchStart - Index of the match within the scanned text.
 * @returns {{valueRange: number[], attrsRange: number[], range: number[], pos: number}}
 *   `attrsRange` covers the annotation body without its braces.
 */
function calcCarrierRanges(match, baseOffset, matchStart) {
    const [whole, value, attrsBody] = match;

    // The annotation is the tail of the match: "{" + body + "}".
    const braceInMatch = whole.length - attrsBody.length - 2;
    // The value follows at least one delimiter character, hence the search from 1.
    const valueInMatch = whole.indexOf(value, 1);

    const valueStart = baseOffset + matchStart + valueInMatch;
    const attrsOpen = matchStart + braceInMatch; // relative to the scanned text
    const attrsClose = attrsOpen + attrsBody.length + 2; // +2 for { and }

    return {
        valueRange: [valueStart, valueStart + value.length],
        attrsRange: [baseOffset + attrsOpen + 1, baseOffset + attrsClose - 1], // Exclude braces
        range: [baseOffset + matchStart, baseOffset + attrsClose],
        pos: attrsClose
    };
}
214
+
215
/**
 * Try to read an emphasis-style carrier that begins exactly at `pos`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Index at which the carrier must start.
 * @param {number} baseOffset - Offset passed on to the range calculation.
 * @returns {object|null} An 'emphasis' carrier, or null when the sticky
 *   pattern does not match at `pos`.
 */
function tryExtractEmphasisCarrier(text, pos, baseOffset) {
    const pattern = INLINE_CARRIER_PATTERNS.EMPHASIS;
    pattern.lastIndex = pos; // sticky pattern: anchors the attempt at `pos`
    const hit = pattern.exec(text);
    if (!hit) {
        return null;
    }

    const { attrsRange, valueRange, range, pos: resumeAt } = calcCarrierRanges(hit, baseOffset, hit.index);
    const [, value, attrsBody] = hit;
    return createCarrier('emphasis', value, `{${attrsBody}}`, attrsRange, valueRange, range, resumeAt);
}
224
+
225
/**
 * Try to read a double-backtick code-span carrier that begins exactly at `pos`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Index at which the carrier must start.
 * @param {number} baseOffset - Offset passed on to the range calculation.
 * @returns {object|null} A 'code' carrier, or null when the sticky pattern
 *   does not match at `pos`.
 */
function tryExtractCodeCarrier(text, pos, baseOffset) {
    const pattern = INLINE_CARRIER_PATTERNS.CODE_SPAN;
    pattern.lastIndex = pos; // sticky pattern: anchors the attempt at `pos`
    const hit = pattern.exec(text);
    if (!hit) {
        return null;
    }

    const { attrsRange, valueRange, range, pos: resumeAt } = calcCarrierRanges(hit, baseOffset, hit.index);
    const [, value, attrsBody] = hit;
    return createCarrier('code', value, `{${attrsBody}}`, attrsRange, valueRange, range, resumeAt);
}
234
+
235
/**
 * Try to read a bracket carrier — `[text]`, optionally followed by `(url)`
 * and/or `{annotation}` — that begins exactly at `pos`.
 *
 * The start check looks only at `text[pos]`. The earlier
 * `text.indexOf('[', pos)` scanned the rest of the string at every call,
 * which made the caller's character loop quadratic on bracket-free text.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Index at which the carrier must start.
 * @param {number} baseOffset - Offset added to every reported range.
 * @returns {object|null} null when there is no balanced bracket at `pos`;
 *   `{ skip: true, pos }` when the URL part starts with "="; otherwise a
 *   'link' or 'span' carrier.
 */
function tryExtractBracketCarrier(text, pos, baseOffset) {
    if (text[pos] !== '[') return null;
    const bracketStart = pos;

    // Index just past the matching "]".
    const bracketEnd = findMatchingBracket(text, bracketStart);
    if (!bracketEnd) return null;

    const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
    const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);

    if (url && url.startsWith('=')) {
        return { skip: true, pos: finalSpanEnd };
    }

    const carrierText = text.substring(bracketStart + 1, bracketEnd - 1);
    const { carrierType, resourceIRI } = determineCarrierType(url);

    return createCarrier(carrierType, carrierText, attrs, attrsRange,
        [baseOffset + bracketStart + 1, baseOffset + bracketEnd - 1],
        [baseOffset + bracketStart, baseOffset + finalSpanEnd],
        finalSpanEnd, { url: resourceIRI });
}
256
+
257
/**
 * Find the end of a bracket group, honouring nested `[` ... `]` pairs.
 *
 * @param {string} text - Text containing the group.
 * @param {number} bracketStart - Index of the opening "[".
 * @returns {number|null} Index just past the matching "]", or null when the
 *   group is never closed.
 */
function findMatchingBracket(text, bracketStart) {
    let depth = 1;
    for (let i = bracketStart + 1; i < text.length; i += 1) {
        const ch = text[i];
        if (ch === '[') {
            depth += 1;
        } else if (ch === ']') {
            depth -= 1;
            if (depth === 0) {
                return i + 1;
            }
        }
    }
    return null;
}
272
+
273
/**
 * Read an optional `(url)` part that directly follows a bracket group.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketEnd - Index just past the closing "]".
 * @returns {{url: string|null, spanEnd: number}} The text between the
 *   parentheses and the index just past ")"; when there is no complete
 *   `(…)` part, `url` is null and `spanEnd` equals `bracketEnd`.
 */
function extractUrlFromBrackets(text, bracketEnd) {
    const none = { url: null, spanEnd: bracketEnd };
    if (text[bracketEnd] !== '(') {
        return none;
    }
    const closeParen = text.indexOf(')', bracketEnd);
    if (closeParen === -1) {
        return none;
    }
    return {
        url: text.substring(bracketEnd + 1, closeParen),
        spanEnd: closeParen + 1
    };
}
287
+
288
/**
 * Read an optional `{annotation}` that follows a carrier, allowing
 * whitespace in between.
 *
 * @param {string} text - Text being scanned.
 * @param {number} spanEnd - Index at which the carrier ended so far.
 * @param {number} baseOffset - Offset added to the reported range.
 * @returns {{attrs: string|null, attrsRange: number[]|null, finalSpanEnd: number}}
 *   The annotation with braces, its range (braces included) and the index
 *   just past it; without an annotation `finalSpanEnd` equals `spanEnd`.
 */
function extractAttributesFromText(text, spanEnd, baseOffset) {
    const found = text.substring(spanEnd).match(/^\s*\{([^}]+)\}/);
    if (!found) {
        return { attrs: null, attrsRange: null, finalSpanEnd: spanEnd };
    }

    const [matched, body] = found;
    const attrs = `{${body}}`;
    const braceAt = matched.indexOf('{');
    const start = baseOffset + spanEnd + (braceAt >= 0 ? braceAt : 0);
    return {
        attrs,
        attrsRange: [start, start + attrs.length],
        finalSpanEnd: spanEnd + matched.length
    };
}
303
+
304
/**
 * Classify a bracket carrier by its URL part.
 *
 * @param {string|null} url - Text taken from the `(…)` part, if any.
 * @returns {{carrierType: string, resourceIRI: string|null}} 'link' with the
 *   URL when a non-empty URL not starting with "=" is present; otherwise
 *   'span' with a null IRI.
 */
function determineCarrierType(url) {
    const isLink = Boolean(url) && !url.startsWith('=');
    return isLink
        ? { carrierType: 'link', resourceIRI: url }
        : { carrierType: 'span', resourceIRI: null };
}
315
+
316
/**
 * Build the origin record ("block") for one annotated carrier.
 *
 * The id is `hash()` of a signature made from the subject, the carrier type
 * ('unknown' when absent), the expanded types and the expanded predicates
 * (each as form + IRI). Ranges do not take part in the id.
 *
 * @param {string} subject - Subject IRI.
 * @param {Array<string|{iri: string}>} types
 * @param {Array<{iri: string, form: string}>} predicates
 * @param {Array|null|undefined} entries - Stored as-is, or [] when falsy.
 * @param {[number, number]} range
 * @param {[number, number]|null} attrsRange
 * @param {[number, number]|null} valueRange
 * @param {string|null} carrierType
 * @param {object} ctx - Prefix context; a shallow copy is stored on the block.
 * @returns {object} The block record.
 */
function createBlock(subject, types, predicates, entries, range, attrsRange, valueRange, carrierType, ctx) {
    const asSpan = (pair) => (pair ? { start: pair[0], end: pair[1] } : null);

    const expandedTypes = types.map((t) => expandIRI(typeof t === 'string' ? t : t.iri, ctx));
    const expandedPredicates = predicates.map((p) => ({ iri: expandIRI(p.iri, ctx), form: p.form }));

    // Use semantic signature for stable block identity
    const signatureParts = [
        subject,
        carrierType || 'unknown',
        expandedTypes.join(','),
        expandedPredicates.map((p) => `${p.form}${p.iri}`).join(',')
    ];

    return {
        id: hash(signatureParts.join('|')),
        range: { start: range[0], end: range[1] },
        attrsRange: asSpan(attrsRange),
        valueRange: asSpan(valueRange),
        carrierType: carrierType || null,
        subject,
        types: expandedTypes,
        predicates: expandedPredicates,
        entries: entries || [],
        context: { ...ctx }
    };
}
345
+
346
/**
 * Create a quad, append it to `quads` and register where it came from.
 *
 * Nothing happens when subject, predicate or object is missing.
 *
 * @param {Array} quads - Output list, appended to in place.
 * @param {Map} quadIndex - Maps the quad's index key to its slot info.
 * @param {string} blockId - Id of the originating block.
 * @param {object} subject
 * @param {object} predicate
 * @param {object} object
 * @param {object} dataFactory - Must provide `quad(subject, predicate, object)`.
 * @param {object|null} [meta] - Extra slot details; `entryIndex` is read from it.
 * @returns {void}
 */
function emitQuad(quads, quadIndex, blockId, subject, predicate, object, dataFactory, meta = null) {
    const isComplete = Boolean(subject && predicate && object);
    if (!isComplete) {
        return;
    }

    const quad = dataFactory.quad(subject, predicate, object);
    quads.push(quad);

    // Slot info carries the caller's metadata plus the three terms.
    const slotDetails = { ...meta, subject, predicate, object };
    const slotInfo = createSlotInfo(blockId, meta?.entryIndex, slotDetails);

    const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
    quadIndex.set(key, slotInfo);
}
361
/**
 * Turn the subject declared by an annotation into a named node.
 *
 * - no subject: returns null;
 * - 'RESET': clears `state.currentSubject` and returns null;
 * - '=#frag': replaces the fragment of the current subject's IRI, or returns
 *   null when there is no current subject;
 * - anything else: expanded with `expandIRI` against `state.ctx`.
 *
 * @param {object} sem - Parsed annotation; only `subject` is read.
 * @param {object} state - Needs `df`, `ctx` and `currentSubject`.
 * @returns {object|null} The named node, or null.
 */
function resolveSubject(sem, state) {
    const declared = sem.subject;
    if (!declared) {
        return null;
    }
    if (declared === 'RESET') {
        state.currentSubject = null;
        return null;
    }
    if (!declared.startsWith('=#')) {
        return state.df.namedNode(expandIRI(declared, state.ctx));
    }
    if (!state.currentSubject) {
        return null;
    }
    const [baseIRI] = state.currentSubject.value.split('#');
    const fragment = declared.substring(2);
    return state.df.namedNode(`${baseIRI}#${fragment}`);
}
378
+
379
/**
 * Turn the object declared by an annotation into a named node.
 *
 * A value starting with "#" replaces the fragment of the current subject's
 * IRI (null when there is no current subject); any other value is expanded
 * with `expandIRI` against `state.ctx`.
 *
 * @param {object} sem - Parsed annotation; only `object` is read.
 * @param {object} state - Needs `df`, `ctx` and `currentSubject`.
 * @returns {object|null} The named node, or null.
 */
function resolveObject(sem, state) {
    const declared = sem.object;
    if (!declared) {
        return null;
    }
    if (!declared.startsWith('#')) {
        // Regular soft IRI
        return state.df.namedNode(expandIRI(declared, state.ctx));
    }
    if (!state.currentSubject) {
        return null;
    }
    const [baseIRI] = state.currentSubject.value.split('#');
    const fragment = declared.substring(1);
    return state.df.namedNode(`${baseIRI}#${fragment}`);
}
393
+
394
/**
 * Emit one rdf:type quad for every type listed in an annotation.
 *
 * The typed resource is the first available of: the newly declared subject,
 * the annotation's local object, the carrier's resource, the subject `S`.
 *
 * @param {object} sem - Parsed annotation; `types` holds strings or
 *   `{ iri, entryIndex }` objects.
 * @param {object|null} newSubject
 * @param {object|null} localObject
 * @param {object|null} carrierO
 * @param {object} S - Subject in effect for the annotation.
 * @param {object} block - Originating block; only `id` is used.
 * @param {object} state - Needs `quads`, `origin.quadIndex`, `df` and `ctx`.
 * @returns {void}
 */
function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state) {
    for (const entry of sem.types) {
        const isPlain = typeof entry === 'string';
        const typeIRI = isPlain ? entry : entry.iri;
        const entryIndex = isPlain ? null : entry.entryIndex;
        const typeSubject = newSubject || localObject || carrierO || S;
        const expandedType = expandIRI(typeIRI, state.ctx);

        const rdfType = state.df.namedNode(expandIRI('rdf:type', state.ctx));
        const typeNode = state.df.namedNode(expandedType);
        const meta = { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex };

        emitQuad(state.quads, state.origin.quadIndex, block.id, typeSubject, rdfType, typeNode, state.df, meta);
    }
}
411
+
412
/**
 * Emit one quad per predicate of an annotation, according to its form:
 *
 * - ''  : (localObject or S) → literal L
 * - '?' : (previousSubject if a new subject was declared, else S)
 *         → (localObject or newSubjectOrCarrierO)
 * - '!' : the '?' pair with subject and object swapped
 *
 * Predicates with any other form, or whose subject or object is missing,
 * emit nothing.
 *
 * @param {object} sem - Parsed annotation; `predicates` is iterated.
 * @param {object|null} newSubject
 * @param {object|null} previousSubject
 * @param {object|null} localObject
 * @param {object|null} newSubjectOrCarrierO
 * @param {object} S - Subject in effect for the annotation.
 * @param {object} L - Literal built from the carrier text.
 * @param {object} block - Originating block; only `id` is used.
 * @param {object} state - Needs `quads`, `origin.quadIndex`, `df` and `ctx`.
 * @returns {void}
 */
function processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state) {
    const anchor = newSubject ? previousSubject : S;
    const resource = localObject || newSubjectOrCarrierO;

    const pickRole = (form) => {
        switch (form) {
            case '':
                return { subject: localObject || S, object: L };
            case '?':
                return { subject: anchor, object: resource };
            case '!':
                return { subject: resource, object: anchor };
            default:
                return null;
        }
    };

    for (const pred of sem.predicates) {
        const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
        const role = pickRole(pred.form);
        if (!role || !role.subject || !role.object) {
            continue;
        }
        emitQuad(
            state.quads, state.origin.quadIndex, block.id,
            role.subject, P, role.object, state.df,
            { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
        );
    }
}
433
+
434
/**
 * Apply one parsed annotation to the parser state: update the current
 * subject, record the originating block and emit type and predicate quads.
 *
 * Subject handling:
 *  - 'RESET' clears `state.currentSubject`; nothing else happens.
 *  - A newly declared subject becomes the current subject unless
 *    `preserveGlobalSubject` is set or an `implicitSubject` is supplied.
 *  - The subject used for this annotation (S) is, with
 *    `preserveGlobalSubject`, the new subject or else the previous one;
 *    otherwise the implicit subject or else the current subject.
 *  - Without any subject the annotation is ignored.
 *
 * The unused `effectiveSubject` local of the previous version was removed.
 *
 * @param {object} carrier - Carrier with `range`, and optionally `text`,
 *   `type`, `url`, `attrsRange`, `valueRange`.
 * @param {object} sem - Parsed annotation.
 * @param {object} state - Parser state; `currentSubject`, `origin.blocks`
 *   and (through the emit helpers) `quads` may be modified.
 * @param {{preserveGlobalSubject?: boolean, implicitSubject?: object|null}} [options]
 * @returns {void}
 */
function processAnnotation(carrier, sem, state, options = {}) {
    const { preserveGlobalSubject = false, implicitSubject = null } = options;

    if (sem.subject === 'RESET') {
        state.currentSubject = null;
        return;
    }

    const previousSubject = state.currentSubject;
    const newSubject = resolveSubject(sem, state);
    const localObject = resolveObject(sem, state);

    if (newSubject && !preserveGlobalSubject && !implicitSubject) {
        state.currentSubject = newSubject;
    }
    const S = preserveGlobalSubject
        ? (newSubject || previousSubject)
        : (implicitSubject || state.currentSubject);
    if (!S) return;

    const block = createBlock(
        S.value, sem.types, sem.predicates, sem.entries,
        carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
        carrier.type || null, state.ctx
    );
    state.origin.blocks.set(block.id, block);

    const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
    const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
    const newSubjectOrCarrierO = newSubject || carrierO;

    processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state);
    processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state);
}
468
+
469
+ // Helper functions for list item processing
470
/**
 * Look for a subject declaration in an annotation string.
 *
 * @param {string|null} attrs - Raw annotation text.
 * @param {object} state - Parser state handed to `resolveSubject`.
 * @param {object|null} [carrierInfo] - Carrier to report with the subject;
 *   a placeholder of type 'unknown' is used when omitted.
 * @returns {{subject: object, carrier: object}|null} The resolved subject and
 *   its carrier, or null when no subject is declared (or it is 'RESET', or
 *   it cannot be resolved).
 */
function findSubjectInAttrs(attrs, state, carrierInfo = null) {
    const sem = parseSemCached(attrs);
    const declaresSubject = Boolean(sem.subject) && sem.subject !== 'RESET';
    const subject = declaresSubject ? resolveSubject(sem, state) : null;
    if (!subject) {
        return null;
    }
    const carrier = carrierInfo || { type: 'unknown', text: '', attrs };
    return { subject, carrier };
}
480
+
481
/**
 * Determine the subject of a list item.
 *
 * The item's own annotation is consulted first; after that each inline
 * carrier in order. The first declaration that resolves wins.
 *
 * @param {object} listToken - List token (`text`, `attrs`, `range`).
 * @param {Array<object>} carriers - Inline carriers of the item.
 * @param {object} state - Parser state.
 * @returns {{subject: object, carrier: object}|null} The subject and the
 *   carrier that declared it, or null when none declares one.
 */
export function findItemSubject(listToken, carriers, state) {
    const { text, attrs, range } = listToken;
    const own = findSubjectInAttrs(attrs, state, { type: 'list', text, attrs, range });
    if (own) {
        return own;
    }

    for (const carrier of carriers) {
        const viaCarrier = findSubjectInAttrs(carrier.attrs, state, carrier);
        if (viaCarrier) {
            return viaCarrier;
        }
    }

    return null;
}
494
+
495
/**
 * Tell whether a list item declares predicates of its own, either in the
 * item annotation or in one of its inline carriers.
 *
 * A predicate counts when it has no truthy `subject` property and its IRI
 * is not 'RESET'.
 *
 * @param {object} listToken - List token; `attrs` is read.
 * @param {Array<object>} carriers - Inline carriers of the item.
 * @returns {boolean}
 */
function hasOwnPredicates(listToken, carriers) {
    const isOwnPredicate = (p) => !p.subject && p.iri !== 'RESET';
    const declaresOwn = (attrs) => parseSemCached(attrs).predicates.some(isOwnPredicate);

    if (listToken.attrs && declaresOwn(listToken.attrs)) {
        return true;
    }
    return carriers.some((carrier) => declaresOwn(carrier.attrs));
}
508
+
509
+ // Unified list context processing
510
/**
 * Apply a list context annotation to one list item.
 *
 * Emits an rdf:type quad on the item for every context type, then one quad
 * per directional predicate: '!' goes item → context subject, '?' goes
 * context subject → item. Literal predicates (form '') are not emitted here.
 *
 * @param {object} args
 * @param {object} args.sem - Parsed context annotation.
 * @param {object} args.itemSubject - Subject of the list item.
 * @param {object} args.contextSubject - Subject the list hangs off.
 * @param {boolean} [args.inheritLiterals=false] - When true, the literal
 *   predicates are returned as a fresh annotation for the caller to apply.
 * @param {object} args.state - Parser state.
 * @param {string} [args.blockId='list-context'] - Block id recorded on the quads.
 * @returns {object|null} The annotation to inherit, or null when none is
 *   requested or the context has no literal predicates.
 */
function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) {
    const emit = (subject, predicate, object) =>
        emitQuad(state.quads, state.origin.quadIndex, blockId, subject, predicate, object, state.df);

    // Emit types
    for (const entry of sem.types) {
        const typeIRI = typeof entry === 'string' ? entry : entry.iri;
        emit(
            itemSubject,
            state.df.namedNode(expandIRI('rdf:type', state.ctx)),
            state.df.namedNode(expandIRI(typeIRI, state.ctx))
        );
    }

    // Emit directional predicates
    for (const pred of sem.predicates) {
        const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
        if (pred.form === '!') {
            emit(itemSubject, P, contextSubject);
        } else if (pred.form === '?') {
            emit(contextSubject, P, itemSubject);
        }
    }

    if (!inheritLiterals) {
        return null;
    }
    const literalPredicates = sem.predicates.filter((p) => p.form === '');
    // Note: caller must handle subject switching for literal inheritance
    return literalPredicates.length > 0 ? createInheritedSem(literalPredicates) : null;
}
544
+
545
+ // Lightweight semantic object constructor for inherited predicates
546
/**
 * Build a minimal annotation object that carries only the given predicates.
 *
 * @param {Array<{iri: string, form: string, entryIndex: *}>} predicates
 * @returns {object} Annotation with copies of the predicates (`iri`, `form`
 *   and `entryIndex` only) and every other field empty.
 */
function createInheritedSem(predicates) {
    const copied = predicates.map(({ iri, form, entryIndex }) => ({ iri, form, entryIndex }));
    return {
        subject: null,
        object: null,
        types: [],
        predicates: copied,
        datatype: null,
        language: null,
        entries: []
    };
}
557
+
558
+ // List stack management functions
559
/**
 * Bring the list stack in line with the indentation of the next list item.
 *
 * 1. Frames with a larger indent than the item are popped.
 * 2. A pending list context always opens a new frame anchored on the context
 *    subject, and is then cleared.
 * 3. Otherwise a frame is opened only when the stack is empty or the item is
 *    indented deeper than the top frame; it takes its subjects from the
 *    parent frame's anchor and has no context annotation.
 * 4. An item at the same indent as the top frame leaves the stack as it is.
 *
 * @param {object} token - List token; `indent` is read.
 * @param {object} state - `listStack` and `pendingListContext` are updated.
 * @returns {void}
 */
function manageListStack(token, state) {
    const stack = state.listStack;
    const top = () => stack[stack.length - 1];

    while (stack.length && token.indent < top().indent) {
        stack.pop();
    }

    const pending = state.pendingListContext;
    if (pending) {
        stack.push({
            indent: token.indent,
            anchorSubject: pending.subject,
            contextSubject: pending.subject,
            contextSem: pending.sem
        });
        state.pendingListContext = null;
        return;
    }

    const opensNestedList = stack.length === 0 || token.indent > top().indent;
    if (!opensNestedList) {
        return;
    }

    const parentFrame = stack.length > 0 ? top() : null;
    stack.push({
        indent: token.indent,
        anchorSubject: parentFrame?.anchorSubject || null,
        contextSubject: parentFrame?.anchorSubject || null,
        contextSem: null
    });
}
590
+
591
/**
 * Process one list item: find its subject, apply the enclosing list context
 * and then the item's own and inline annotations.
 *
 * Items without a resolvable subject are ignored. The item subject is
 * recorded as the anchor of the current list frame so a nested list can
 * attach to it.
 *
 * The list context is applied exactly once per item. Literal predicates are
 * requested in that same call when the item has no predicates of its own.
 * The previous version called `processContextSem` a second time for that
 * purpose, which pushed every context type and directional quad into
 * `state.quads` twice.
 *
 * @param {object} token - List token.
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processListItem(token, state) {
    const carriers = getCarriers(token);

    // Find item subject from list token or inline carriers
    const itemInfo = findItemSubject(token, carriers, state);
    if (!itemInfo) return;

    const { subject: itemSubject } = itemInfo;

    // Track this item's subject on the current frame for nested contexts
    const listFrame = state.listStack[state.listStack.length - 1];
    if (listFrame) {
        listFrame.anchorSubject = itemSubject;
    }

    // Apply list context if available
    if (listFrame?.contextSem) {
        const inheritedSem = processContextSem({
            sem: listFrame.contextSem,
            itemSubject,
            contextSubject: listFrame.contextSubject,
            // Literal predicates are inherited only when the item has none of its own
            inheritLiterals: !hasOwnPredicates(token, carriers),
            state
        });
        if (inheritedSem) {
            // Apply the inherited predicates with the item as subject, then
            // put the document-level subject back.
            const prevSubject = state.currentSubject;
            state.currentSubject = itemSubject;
            processAnnotation(createCarrierFromToken(token, 'list'), inheritedSem, state, { preserveGlobalSubject: true });
            state.currentSubject = prevSubject;
        }
    }

    // Process item's own annotations
    if (token.attrs) {
        const sem = parseSemCached(token.attrs);
        processAnnotation(createCarrierFromToken(token, 'list'), sem, state, {
            preserveGlobalSubject: !state.listStack.length,
            implicitSubject: itemSubject
        });
    }

    // Process inline carriers' annotations
    for (const carrier of carriers) {
        if (!carrier.attrs) continue;
        const sem = parseSemCached(carrier.attrs);
        processAnnotation(carrier, sem, state, {
            preserveGlobalSubject: !state.listStack.length,
            implicitSubject: itemSubject
        });
    }
}
656
+
657
/**
 * Treat a paragraph of the form `text {annotation}` as the context of the
 * list that follows and store it in `state.pendingListContext`.
 *
 * The context subject is normally the current subject, falling back to the
 * document subject. When a list is already open and the next token is a
 * list item indented deeper than the top frame, the anchor of that frame
 * is used instead (if it has one).
 *
 * Paragraphs that do not end in an annotation leave the state untouched.
 *
 * @param {object} token - Paragraph token; `text` is read.
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processListContextFromParagraph(token, state) {
    const parsed = token.text.match(/^(.+?)\s*\{([^}]+)\}$/);
    if (!parsed) {
        return;
    }

    const sem = parseSemCached(`{${parsed[2]}}`);

    // Look ahead one token to find out whether a nested list follows.
    const upcoming = state.tokens && state.tokens[state.currentTokenIndex + 1];
    const enclosing = state.listStack.length > 0
        ? state.listStack[state.listStack.length - 1]
        : null;
    const startsNestedList = Boolean(
        enclosing
        && upcoming
        && upcoming.type === 'list'
        && enclosing.anchorSubject
        && upcoming.indent > enclosing.indent
    );

    const subject = startsNestedList
        ? enclosing.anchorSubject
        : (state.currentSubject || state.documentSubject);

    state.pendingListContext = { sem, subject };
}
685
+
686
+ // Helper functions for token processing
687
/**
 * Present a block-level token as a carrier.
 *
 * @param {object} token - Source token.
 * @param {string} tokenType - Value for the carrier's `type`.
 * @returns {object} Carrier with the token's text and ranges; missing
 *   `attrsRange` / `valueRange` become null.
 */
function createCarrierFromToken(token, tokenType) {
    const { text, range } = token;
    const attrsRange = token.attrsRange || null;
    const valueRange = token.valueRange || null;
    return { type: tokenType, text, range, attrsRange, valueRange };
}
696
+
697
/**
 * Apply the annotation attached to a token itself, followed by the
 * annotations of its inline carriers, in order.
 *
 * @param {object} token - Token to process.
 * @param {object} state - Parser state.
 * @param {string} tokenType - Carrier type used for the token's own annotation.
 * @returns {void}
 */
function processTokenAnnotations(token, state, tokenType) {
    if (token.attrs) {
        const ownSem = parseSemCached(token.attrs);
        const ownCarrier = createCarrierFromToken(token, tokenType);
        processAnnotation(ownCarrier, ownSem, state);
    }

    for (const carrier of getCarriers(token)) {
        if (!carrier.attrs) {
            continue;
        }
        processAnnotation(carrier, parseSemCached(carrier.attrs), state);
    }
}
713
+
714
/**
 * Handle a paragraph that consists solely of a subject declaration,
 * e.g. `{=ex:thing}`.
 *
 * The reported `attrsRange` spans the whole annotation, braces included.
 * The previous end offset was `start + subject length`, which ignored the
 * `{=` and `}` characters and cut the annotation short.
 *
 * NOTE(review): the start offset is derived from `token.range[0]` and the
 * position of `{=` in `token.text`; if the token text had leading whitespace
 * trimmed, the offset is short by that amount — confirm against the scanner.
 *
 * @param {object} token - Paragraph token (`text`, `range`).
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processStandaloneSubject(token, state) {
    const match = token.text.match(/^\s*\{=(.*?)\}\s*$/);
    if (!match) return;

    const attrs = `{=${match[1]}}`;
    const sem = parseSemCached(attrs);
    const attrsStart = token.range[0] + token.text.indexOf('{=');
    const attrsEnd = attrsStart + attrs.length;

    processAnnotation({
        type: 'standalone',
        text: '',
        range: token.range,
        attrsRange: [attrsStart, attrsEnd],
        valueRange: null
    }, sem, state);
}
730
+
731
/**
 * Parse an annotated document into quads plus origin information.
 *
 * Prefix declarations are applied to the context before any other token is
 * processed, wherever they appear in the document. The remaining tokens are
 * then handled in document order.
 *
 * @param {string} text - Document source.
 * @param {object} [options]
 * @param {object} [options.context] - Prefixes merged over DEFAULT_CONTEXT.
 * @param {object} [options.dataFactory] - Term factory; defaults to DataFactory.
 * @returns {{quads: Array, origin: {blocks: Map, quadIndex: Map}, context: object}}
 */
export function parse(text, options = {}) {
    const state = {
        ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
        df: options.dataFactory || DataFactory,
        quads: [],
        origin: { blocks: new Map(), quadIndex: new Map() },
        currentSubject: null,
        documentSubject: null, // main document subject, taken from headings
        listStack: [],
        pendingListContext: null,
        tokens: null, // kept on the state for lookahead
        currentTokenIndex: -1
    };

    state.tokens = scanTokens(text);

    // Prefix declarations take effect for the whole document.
    for (const token of state.tokens) {
        if (token.type === 'prefix') {
            state.ctx[token.prefix] = token.iri;
        }
    }

    // A heading that declares a resolvable subject becomes the document subject.
    const trackDocumentSubject = (token) => {
        if (!token.attrs) return;
        const headingSem = parseSemCached(token.attrs);
        if (!headingSem.subject) return;
        const subject = resolveSubject(headingSem, state);
        if (subject) {
            state.documentSubject = subject;
        }
    };

    for (let index = 0; index < state.tokens.length; index += 1) {
        const token = state.tokens[index];
        state.currentTokenIndex = index;

        if (token.type === 'heading') {
            trackDocumentSubject(token);
            processTokenAnnotations(token, state, token.type);
        } else if (token.type === 'code' || token.type === 'blockquote') {
            processTokenAnnotations(token, state, token.type);
        } else if (token.type === 'para') {
            processStandaloneSubject(token, state);
            processListContextFromParagraph(token, state);
            processTokenAnnotations(token, state, token.type);
        } else if (token.type === 'list') {
            manageListStack(token, state);
            processListItem(token, state);
        }
    }

    return { quads: state.quads, origin: state.origin, context: state.ctx };
}