mdld-parse 0.7.2 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.js CHANGED
@@ -1,105 +1,130 @@
1
1
  import {
2
- DEFAULT_CONTEXT,
3
2
  DataFactory,
4
3
  expandIRI,
5
- parseSemanticBlock,
6
4
  quadIndexKey,
7
5
  createLiteral,
8
6
  hash
9
7
  } from './utils.js';
8
+ import {
9
+ DEFAULT_CONTEXT,
10
+ URL_REGEX,
11
+ FENCE_REGEX,
12
+ PREFIX_REGEX,
13
+ HEADING_REGEX,
14
+ UNORDERED_LIST_REGEX,
15
+ BLOCKQUOTE_REGEX,
16
+ STANDALONE_SUBJECT_REGEX,
17
+ CARRIER_PATTERN_ARRAY,
18
+
19
+ } from './constants.js';
20
+ import {
10
21
 
11
- const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
12
- const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
13
- const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
14
- const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
15
- const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
16
- const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
17
- const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
18
- const INLINE_CARRIER_PATTERNS = {
19
- EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
20
- CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
21
- };
22
-
23
- // Cache for fence regex patterns to avoid recreation
24
- const FENCE_CLOSE_PATTERNS = new Map();
22
+ getFenceClosePattern,
23
+ calcRangeInfo,
24
+ calcAttrsRange,
25
+ createToken,
26
+ createCarrier,
27
+ createListToken,
28
+ parseSemCached,
29
+ EMPTY_SEM,
30
+ parseLangAndAttrs,
31
+ findMatchingBracket,
32
+ extractUrlFromBrackets,
33
+ extractAttributesFromText,
34
+ determineCarrierType,
35
+ calcCarrierRanges,
36
+ extractCleanText,
37
+ RDF_TYPE,
38
+ RDF_STATEMENT,
39
+ RDF_SUBJECT,
40
+ RDF_PREDICATE,
41
+ RDF_OBJECT,
42
+ createLeanOriginEntry,
43
+ resolveFragment,
44
+ resolveSubject,
45
+ resolveObject,
46
+ processTokenWithBlockTracking
47
+ } from './shared.js';
25
48
 
26
- function getFenceClosePattern(fenceChar) {
27
- if (!FENCE_CLOSE_PATTERNS.has(fenceChar)) {
28
- FENCE_CLOSE_PATTERNS.set(fenceChar, new RegExp(`^(${fenceChar}{3,})`));
29
- }
30
- return FENCE_CLOSE_PATTERNS.get(fenceChar);
31
- }
32
49
 
33
- function parseLangAndAttrs(langAndAttrs) {
34
- const spaceIndex = langAndAttrs.indexOf(' ');
35
- const braceIndex = langAndAttrs.indexOf('{');
36
- const langEnd = Math.min(
37
- spaceIndex > -1 ? spaceIndex : Infinity,
38
- braceIndex > -1 ? braceIndex : Infinity
39
- );
40
- return {
41
- lang: langAndAttrs.substring(0, langEnd),
42
- attrsText: langAndAttrs.substring(langEnd).match(/\{[^{}]*\}/)?.[0] || null
50
+ export function parse(text, options = {}) {
51
+ const state = {
52
+ ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
53
+ df: options.dataFactory || DataFactory,
54
+ quads: [],
55
+ quadBuffer: new Map(),
56
+ removeSet: new Set(),
57
+ origin: {
58
+ quadIndex: new Map(),
59
+ blocks: new Map(),
60
+ documentStructure: []
61
+ },
62
+ currentSubject: null,
63
+ tokens: null,
64
+ currentTokenIndex: -1,
65
+ statements: [],
66
+ statementCandidates: new Map(),
67
+ currentBlock: null,
68
+ blockStack: []
43
69
  };
44
- }
45
70
 
46
- const semCache = {};
47
- const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
71
+ state.tokens = scanTokens(text);
48
72
 
49
- function parseSemCached(attrs) {
50
- if (!attrs) return EMPTY_SEM;
51
- let sem = semCache[attrs];
52
- if (!sem) {
53
- sem = Object.freeze(parseSemanticBlock(attrs));
54
- semCache[attrs] = sem;
73
+ // Single-pass processing: resolve prefixes AND process tokens together
74
+ for (let i = 0; i < state.tokens.length; i++) {
75
+ const token = state.tokens[i];
76
+ state.currentTokenIndex = i;
77
+
78
+ // Handle prefix tokens immediately during main pass
79
+ if (token.type === 'prefix') {
80
+ let resolvedIri = token.iri;
81
+ if (token.iri.includes(':')) {
82
+ const colonIndex = token.iri.indexOf(':');
83
+ const potentialPrefix = token.iri.substring(0, colonIndex);
84
+ const reference = token.iri.substring(colonIndex + 1);
85
+ if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
86
+ resolvedIri = state.ctx[potentialPrefix] + reference;
87
+ }
88
+ }
89
+ state.ctx[token.prefix] = resolvedIri;
90
+ continue; // Skip token processor for prefixes
91
+ }
92
+
93
+ // Process all other tokens
94
+ TOKEN_PROCESSORS[token.type]?.(token, state);
55
95
  }
56
- return sem;
57
- }
58
96
 
59
- function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
60
- const wsLength = prefixLength < line.length && line[prefixLength] === ' ' ? 1 :
61
- line.slice(prefixLength).match(/^\s+/)?.[0]?.length || 0;
62
- const valueStartInLine = prefixLength + wsLength;
63
- return {
64
- valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
65
- attrsRange: calcAttrsRange(line, attrs, lineStart)
66
- };
67
- }
97
+ // Optimized quad filtering - use Set.has() instead of array.includes()
98
+ const quadKeys = new Set();
99
+ for (const quad of state.quads) {
100
+ quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
101
+ }
68
102
 
69
- function calcAttrsRange(line, attrs, lineStart) {
70
- if (!attrs) return null;
71
- const attrsStartInLine = line.lastIndexOf(attrs);
72
- return attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length] : null;
73
- }
103
+ // Direct Set iteration - more efficient than filter()
104
+ const filteredRemove = [];
105
+ for (const quad of state.removeSet) {
106
+ const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
107
+ if (!quadKeys.has(key)) {
108
+ filteredRemove.push(quad);
109
+ }
110
+ }
74
111
 
75
- function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
76
- const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
77
- Object.defineProperty(token, '_carriers', {
78
- enumerable: false, writable: true, value: null
79
- });
80
- return token;
112
+ return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
81
113
  }
82
114
 
115
+
116
+ // Cache for fence regex patterns - using shared utility
117
+
83
118
  function getCarriers(token) {
84
119
  if (token.type === 'code') return [];
85
120
  return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
86
121
  }
87
122
 
88
- const createListToken = (type, line, lineStart, pos, match) => {
89
- const attrs = match[4] || null;
90
- const prefix = match[1].length + (match[2] ? match[2].length : 0);
91
- const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
92
- return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
93
- rangeInfo.attrsRange, rangeInfo.valueRange, { indent: match[1].length });
94
- };
95
-
96
123
  function scanTokens(text) {
97
124
  const tokens = [];
98
125
  const lines = text.split('\n');
99
126
  let pos = 0;
100
127
  let codeBlock = null;
101
-
102
- // Direct lookup instead of linear search
103
128
  const PROCESSORS = [
104
129
  { type: 'fence', test: line => FENCE_REGEX.test(line.trim()), process: handleFence },
105
130
  { type: 'content', test: () => codeBlock, process: line => codeBlock.content.push(line) },
@@ -206,16 +231,6 @@ function scanTokens(text) {
206
231
  return tokens;
207
232
  }
208
233
 
209
- function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
210
- return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
211
- }
212
-
213
- // Pre-compiled carrier patterns for better performance
214
- const CARRIER_PATTERN_ARRAY = [
215
- ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
216
- ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
217
- ];
218
-
219
234
  function extractInlineCarriers(text, baseOffset = 0) {
220
235
  const carriers = [];
221
236
  let pos = 0;
@@ -280,73 +295,124 @@ function extractInlineCarriers(text, baseOffset = 0) {
280
295
  return carriers;
281
296
  }
282
297
 
283
- function calcCarrierRanges(match, baseOffset, matchStart) {
284
- const valueStart = baseOffset + matchStart + match[0].indexOf(match[1]);
285
- const valueEnd = valueStart + match[1].length;
286
- const attrsStart = baseOffset + matchStart + match[0].indexOf('{');
287
- const attrsEnd = attrsStart + match[2].length + 2; // +2 for { and }
288
- return {
289
- valueRange: [valueStart, valueEnd],
290
- attrsRange: [attrsStart + 1, attrsEnd - 1], // Exclude braces
291
- range: [baseOffset + matchStart, attrsEnd],
292
- pos: matchStart + match[0].length // pos should be relative to current text, not document
298
+
299
+ function createBlockEntry(token, state) {
300
+ const blockId = token._blockId || hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
301
+ token._blockId = blockId; // Store for later reference
302
+
303
+ const cleanText = extractCleanText(token);
304
+
305
+ const blockEntry = {
306
+ id: blockId,
307
+ type: token.type,
308
+ range: token.range,
309
+ text: cleanText,
310
+ subject: null,
311
+ types: [],
312
+ predicates: [],
313
+ carriers: [],
314
+ listLevel: token.indent || 0,
315
+ parentBlockId: state.blockStack.length > 0 ? state.blockStack[state.blockStack.length - 1] : null,
316
+ quadKeys: [] // Will be populated during quad emission
293
317
  };
318
+
319
+ // Store block and add to document structure
320
+ state.origin.blocks.set(blockId, blockEntry);
321
+ state.origin.documentStructure.push(blockEntry);
322
+
323
+ return blockEntry;
294
324
  }
295
325
 
296
- function findMatchingBracket(text, bracketStart) {
297
- let bracketDepth = 1;
298
- let bracketEnd = bracketStart + 1;
326
+ function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
327
+ // Update subject if available
328
+ if (sem.subject && sem.subject !== 'RESET') {
329
+ const resolvedSubject = resolveSubject(sem, state);
330
+ if (resolvedSubject) {
331
+ blockEntry.subject = resolvedSubject.value;
332
+ }
333
+ }
299
334
 
300
- while (bracketEnd < text.length && bracketDepth > 0) {
301
- if (text[bracketEnd] === '[') bracketDepth++;
302
- else if (text[bracketEnd] === ']') bracketDepth--;
303
- bracketEnd++;
335
+ // Add types
336
+ if (sem.types && sem.types.length > 0) {
337
+ sem.types.forEach(t => {
338
+ const typeIRI = typeof t === 'string' ? t : t.iri;
339
+ const expanded = expandIRI(typeIRI, state.ctx);
340
+ if (!blockEntry.types.includes(expanded)) {
341
+ blockEntry.types.push(expanded);
342
+ }
343
+ });
304
344
  }
305
345
 
306
- return bracketDepth > 0 ? null : bracketEnd;
307
- }
346
+ // Add predicates
347
+ if (sem.predicates && sem.predicates.length > 0) {
348
+ sem.predicates.forEach(pred => {
349
+ const expandedPred = {
350
+ iri: expandIRI(pred.iri, state.ctx),
351
+ form: pred.form || '',
352
+ object: null // Will be filled during quad emission
353
+ };
354
+ blockEntry.predicates.push(expandedPred);
355
+ });
356
+ }
308
357
 
309
- function extractUrlFromBrackets(text, bracketEnd) {
310
- let url = null;
311
- let spanEnd = bracketEnd;
358
+ // Add carrier information
359
+ if (carrier) {
360
+ const carrierInfo = {
361
+ type: carrier.type,
362
+ range: carrier.range,
363
+ text: carrier.text,
364
+ subject: null,
365
+ predicates: []
366
+ };
312
367
 
313
- if (text[spanEnd] === '(') {
314
- const parenEnd = text.indexOf(')', spanEnd);
315
- if (parenEnd !== -1) {
316
- url = text.substring(spanEnd + 1, parenEnd);
317
- spanEnd = parenEnd + 1;
368
+ // Extract carrier-specific semantics
369
+ if (carrier.attrs) {
370
+ const carrierSem = parseSemCached(carrier.attrs);
371
+ if (carrierSem.types) {
372
+ carrierInfo.predicates = carrierSem.predicates || [];
373
+ }
318
374
  }
319
- }
320
375
 
321
- return { url, spanEnd };
376
+ blockEntry.carriers.push(carrierInfo);
377
+ }
322
378
  }
323
379
 
324
- function extractAttributesFromText(text, spanEnd, baseOffset) {
325
- let attrs = null;
326
- let attrsRange = null;
327
- const remaining = text.substring(spanEnd);
328
-
329
- const wsMatch = remaining.match(/^\s+/);
330
- const attrsStart = wsMatch ? wsMatch[0].length : 0;
331
-
332
- if (remaining[attrsStart] === '{') {
333
- const braceEnd = remaining.indexOf('}', attrsStart);
334
- if (braceEnd !== -1) {
335
- attrs = remaining.substring(attrsStart, braceEnd + 1);
336
- const absStart = baseOffset + spanEnd + attrsStart;
337
- attrsRange = [absStart, absStart + attrs.length];
338
- spanEnd += braceEnd + 1;
339
- }
380
+ function processAnnotationWithBlockTracking(carrier, sem, state, options = {}) {
381
+ const { preserveGlobalSubject = false, implicitSubject = null } = options;
382
+
383
+ if (sem.subject === 'RESET') {
384
+ state.currentSubject = null;
385
+ return;
340
386
  }
341
387
 
342
- return { attrs, attrsRange, finalSpanEnd: spanEnd };
343
- }
388
+ const previousSubject = state.currentSubject;
389
+ const newSubject = resolveSubject(sem, state);
390
+ const localObject = resolveObject(sem, state);
391
+
392
+ const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
393
+ if (newSubject && !preserveGlobalSubject && !implicitSubject) {
394
+ state.currentSubject = newSubject;
395
+ }
396
+ const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
397
+ if (!S) return;
398
+
399
+ const block = createBlock(
400
+ S.value, sem.types, sem.predicates,
401
+ carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
402
+ carrier.type || null, state.ctx, carrier.text
403
+ );
404
+
405
+ const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
406
+ const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
407
+ const newSubjectOrCarrierO = newSubject || carrierO;
344
408
 
345
- function determineCarrierType(url) {
346
- if (url && !url.startsWith('=')) {
347
- return { carrierType: 'link', resourceIRI: url };
409
+ // Enrich current block with semantic information
410
+ if (state.currentBlock) {
411
+ enrichBlockFromAnnotation(state.currentBlock, sem, carrier, state);
348
412
  }
349
- return { carrierType: 'span', resourceIRI: null };
413
+
414
+ processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
415
+ processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
350
416
  }
351
417
 
352
418
  function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
@@ -371,7 +437,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
371
437
  };
372
438
  }
373
439
 
374
- function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
440
+ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null, state = null) {
375
441
  if (!subject || !predicate || !object) return;
376
442
 
377
443
  const quad = dataFactory.quad(subject, predicate, object);
@@ -407,29 +473,21 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
407
473
  // Detect rdf:Statement pattern during single-pass parsing
408
474
  detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
409
475
 
410
- // Create lean origin entry - avoid spread operator for better performance
411
- const originEntry = {
412
- blockId: block.id,
413
- range: block.range,
414
- carrierType: block.carrierType,
415
- subject: subject.value,
416
- predicate: predicate.value,
417
- context: block.context, // Direct reference instead of spread
418
- polarity: meta?.remove ? '-' : '+',
419
- value: block.text || ''
420
- };
476
+ // Create lean origin entry using shared utility
477
+ const originEntry = createLeanOriginEntry(block, subject, predicate, meta);
421
478
 
422
479
  quadIndex.set(quadKey, originEntry);
480
+
481
+ // Link block to this quad for reverse lookup during rendering
482
+ if (state.currentBlock && block.id === state.currentBlock.id) {
483
+ if (!state.currentBlock.quadKeys) {
484
+ state.currentBlock.quadKeys = [];
485
+ }
486
+ state.currentBlock.quadKeys.push(quadKey);
487
+ }
423
488
  }
424
489
  }
425
490
 
426
- // Extract RDF constants once at module level for efficiency
427
- const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
428
- const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
429
- const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
430
- const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
431
- const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
432
-
433
491
  function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
434
492
  // Skip if not called from parse context (for testing compatibility)
435
493
  if (!statements || !statementCandidates) return;
@@ -479,30 +537,6 @@ function detectStatementPatternSinglePass(quad, dataFactory, meta, statements =
479
537
  }
480
538
  }
481
539
 
482
- const resolveFragment = (fragment, state) => {
483
- if (!state.currentSubject) return null;
484
- const subjectValue = state.currentSubject.value;
485
- const hashIndex = subjectValue.indexOf('#');
486
- const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
487
- return state.df.namedNode(baseIRI + '#' + fragment);
488
- };
489
-
490
- function resolveSubject(sem, state) {
491
- if (!sem.subject) return null;
492
- if (sem.subject === 'RESET') {
493
- state.currentSubject = null;
494
- return null;
495
- }
496
- if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
497
- return state.df.namedNode(expandIRI(sem.subject, state.ctx));
498
- }
499
-
500
- function resolveObject(sem, state) {
501
- if (!sem.object) return null;
502
- if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
503
- return state.df.namedNode(expandIRI(sem.object, state.ctx));
504
- }
505
-
506
540
  const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
507
541
  const expandedType = expandIRI(typeIRI, state.ctx);
508
542
  const typeInfo = typeof entryIndex === 'object' ? entryIndex : { entryIndex, remove: false };
@@ -513,7 +547,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
513
547
  state.df.namedNode(expandedType),
514
548
  state.df,
515
549
  { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
516
- state.statements, state.statementCandidates
550
+ state.statements, state.statementCandidates,
551
+ state
517
552
  );
518
553
  };
519
554
 
@@ -557,43 +592,16 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
557
592
  emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
558
593
  role.subject, P, role.object, state.df,
559
594
  { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
560
- state.statements, state.statementCandidates
595
+ state.statements, state.statementCandidates,
596
+ state
561
597
  );
562
598
  }
563
599
  });
564
600
  }
565
601
 
566
602
  function processAnnotation(carrier, sem, state, options = {}) {
567
- const { preserveGlobalSubject = false, implicitSubject = null } = options;
568
-
569
- if (sem.subject === 'RESET') {
570
- state.currentSubject = null;
571
- return;
572
- }
573
-
574
- const previousSubject = state.currentSubject;
575
- const newSubject = resolveSubject(sem, state);
576
- const localObject = resolveObject(sem, state);
577
-
578
- const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
579
- if (newSubject && !preserveGlobalSubject && !implicitSubject) {
580
- state.currentSubject = newSubject;
581
- }
582
- const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
583
- if (!S) return;
584
-
585
- const block = createBlock(
586
- S.value, sem.types, sem.predicates,
587
- carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
588
- carrier.type || null, state.ctx, carrier.text
589
- );
590
-
591
- const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
592
- const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
593
- const newSubjectOrCarrierO = newSubject || carrierO;
594
-
595
- processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
596
- processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
603
+ // Use the enhanced block tracking version
604
+ processAnnotationWithBlockTracking(carrier, sem, state, options);
597
605
  }
598
606
 
599
607
 
@@ -634,77 +642,9 @@ function processStandaloneSubject(token, state) {
634
642
  }
635
643
 
636
644
  const TOKEN_PROCESSORS = {
637
- heading: (token, state) => {
638
- processTokenAnnotations(token, state, token.type);
639
- },
640
- code: (token, state) => {
641
- processTokenAnnotations(token, state, token.type);
642
- },
643
- blockquote: (token, state) => {
644
- processTokenAnnotations(token, state, token.type);
645
- },
646
- para: (token, state) => {
647
- processStandaloneSubject(token, state);
648
- processTokenAnnotations(token, state, token.type);
649
- },
650
- list: (token, state) => {
651
- processTokenAnnotations(token, state, token.type);
652
- },
645
+ heading: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
646
+ code: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
647
+ blockquote: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
648
+ para: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry, [processStandaloneSubject]),
649
+ list: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
653
650
  };
654
-
655
- export function parse(text, options = {}) {
656
- const state = {
657
- ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
658
- df: options.dataFactory || DataFactory,
659
- quads: [],
660
- quadBuffer: new Map(),
661
- removeSet: new Set(),
662
- origin: { quadIndex: new Map() },
663
- currentSubject: null,
664
- tokens: null,
665
- currentTokenIndex: -1,
666
- statements: [],
667
- statementCandidates: new Map() // Track incomplete rdf:Statement patterns
668
- };
669
-
670
- state.tokens = scanTokens(text);
671
-
672
- // Single loop instead of filter+forEach for better performance
673
- for (const token of state.tokens) {
674
- if (token.type === 'prefix') {
675
- let resolvedIri = token.iri;
676
- if (token.iri.includes(':')) {
677
- const colonIndex = token.iri.indexOf(':');
678
- const potentialPrefix = token.iri.substring(0, colonIndex);
679
- const reference = token.iri.substring(colonIndex + 1);
680
- if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
681
- resolvedIri = state.ctx[potentialPrefix] + reference;
682
- }
683
- }
684
- state.ctx[token.prefix] = resolvedIri;
685
- }
686
- }
687
-
688
- for (let i = 0; i < state.tokens.length; i++) {
689
- const token = state.tokens[i];
690
- state.currentTokenIndex = i;
691
- TOKEN_PROCESSORS[token.type]?.(token, state);
692
- }
693
-
694
- // Optimize array operations - avoid Array.from() and filter()
695
- const quadKeys = new Set();
696
- for (const quad of state.quads) {
697
- quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
698
- }
699
-
700
- // Direct iteration instead of Array.from() + filter()
701
- const filteredRemove = [];
702
- for (const quad of state.removeSet) {
703
- const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
704
- if (!quadKeys.has(key)) {
705
- filteredRemove.push(quad);
706
- }
707
- }
708
-
709
- return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
710
- }