mdld-parse 0.7.2 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/constants.js +30 -0
- package/src/generate.js +37 -53
- package/src/index.js +1 -1
- package/src/locate.js +2 -17
- package/src/merge.js +4 -5
- package/src/parse.js +222 -282
- package/src/render.js +320 -357
- package/src/shared.js +529 -0
- package/src/utils.js +2 -9
package/src/parse.js
CHANGED
|
@@ -1,105 +1,130 @@
|
|
|
1
1
|
import {
|
|
2
|
-
DEFAULT_CONTEXT,
|
|
3
2
|
DataFactory,
|
|
4
3
|
expandIRI,
|
|
5
|
-
parseSemanticBlock,
|
|
6
4
|
quadIndexKey,
|
|
7
5
|
createLiteral,
|
|
8
6
|
hash
|
|
9
7
|
} from './utils.js';
|
|
8
|
+
import {
|
|
9
|
+
DEFAULT_CONTEXT,
|
|
10
|
+
URL_REGEX,
|
|
11
|
+
FENCE_REGEX,
|
|
12
|
+
PREFIX_REGEX,
|
|
13
|
+
HEADING_REGEX,
|
|
14
|
+
UNORDERED_LIST_REGEX,
|
|
15
|
+
BLOCKQUOTE_REGEX,
|
|
16
|
+
STANDALONE_SUBJECT_REGEX,
|
|
17
|
+
CARRIER_PATTERN_ARRAY,
|
|
18
|
+
|
|
19
|
+
} from './constants.js';
|
|
20
|
+
import {
|
|
10
21
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
getFenceClosePattern,
|
|
23
|
+
calcRangeInfo,
|
|
24
|
+
calcAttrsRange,
|
|
25
|
+
createToken,
|
|
26
|
+
createCarrier,
|
|
27
|
+
createListToken,
|
|
28
|
+
parseSemCached,
|
|
29
|
+
EMPTY_SEM,
|
|
30
|
+
parseLangAndAttrs,
|
|
31
|
+
findMatchingBracket,
|
|
32
|
+
extractUrlFromBrackets,
|
|
33
|
+
extractAttributesFromText,
|
|
34
|
+
determineCarrierType,
|
|
35
|
+
calcCarrierRanges,
|
|
36
|
+
extractCleanText,
|
|
37
|
+
RDF_TYPE,
|
|
38
|
+
RDF_STATEMENT,
|
|
39
|
+
RDF_SUBJECT,
|
|
40
|
+
RDF_PREDICATE,
|
|
41
|
+
RDF_OBJECT,
|
|
42
|
+
createLeanOriginEntry,
|
|
43
|
+
resolveFragment,
|
|
44
|
+
resolveSubject,
|
|
45
|
+
resolveObject,
|
|
46
|
+
processTokenWithBlockTracking
|
|
47
|
+
} from './shared.js';
|
|
25
48
|
|
|
26
|
-
function getFenceClosePattern(fenceChar) {
|
|
27
|
-
if (!FENCE_CLOSE_PATTERNS.has(fenceChar)) {
|
|
28
|
-
FENCE_CLOSE_PATTERNS.set(fenceChar, new RegExp(`^(${fenceChar}{3,})`));
|
|
29
|
-
}
|
|
30
|
-
return FENCE_CLOSE_PATTERNS.get(fenceChar);
|
|
31
|
-
}
|
|
32
49
|
|
|
33
|
-
function
|
|
34
|
-
const
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
50
|
+
export function parse(text, options = {}) {
|
|
51
|
+
const state = {
|
|
52
|
+
ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
|
|
53
|
+
df: options.dataFactory || DataFactory,
|
|
54
|
+
quads: [],
|
|
55
|
+
quadBuffer: new Map(),
|
|
56
|
+
removeSet: new Set(),
|
|
57
|
+
origin: {
|
|
58
|
+
quadIndex: new Map(),
|
|
59
|
+
blocks: new Map(),
|
|
60
|
+
documentStructure: []
|
|
61
|
+
},
|
|
62
|
+
currentSubject: null,
|
|
63
|
+
tokens: null,
|
|
64
|
+
currentTokenIndex: -1,
|
|
65
|
+
statements: [],
|
|
66
|
+
statementCandidates: new Map(),
|
|
67
|
+
currentBlock: null,
|
|
68
|
+
blockStack: []
|
|
43
69
|
};
|
|
44
|
-
}
|
|
45
70
|
|
|
46
|
-
|
|
47
|
-
const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
|
|
71
|
+
state.tokens = scanTokens(text);
|
|
48
72
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
73
|
+
// Single-pass processing: resolve prefixes AND process tokens together
|
|
74
|
+
for (let i = 0; i < state.tokens.length; i++) {
|
|
75
|
+
const token = state.tokens[i];
|
|
76
|
+
state.currentTokenIndex = i;
|
|
77
|
+
|
|
78
|
+
// Handle prefix tokens immediately during main pass
|
|
79
|
+
if (token.type === 'prefix') {
|
|
80
|
+
let resolvedIri = token.iri;
|
|
81
|
+
if (token.iri.includes(':')) {
|
|
82
|
+
const colonIndex = token.iri.indexOf(':');
|
|
83
|
+
const potentialPrefix = token.iri.substring(0, colonIndex);
|
|
84
|
+
const reference = token.iri.substring(colonIndex + 1);
|
|
85
|
+
if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
|
|
86
|
+
resolvedIri = state.ctx[potentialPrefix] + reference;
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
state.ctx[token.prefix] = resolvedIri;
|
|
90
|
+
continue; // Skip token processor for prefixes
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// Process all other tokens
|
|
94
|
+
TOKEN_PROCESSORS[token.type]?.(token, state);
|
|
55
95
|
}
|
|
56
|
-
return sem;
|
|
57
|
-
}
|
|
58
96
|
|
|
59
|
-
|
|
60
|
-
const
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
|
|
65
|
-
attrsRange: calcAttrsRange(line, attrs, lineStart)
|
|
66
|
-
};
|
|
67
|
-
}
|
|
97
|
+
// Optimized quad filtering - use Set.has() instead of array.includes()
|
|
98
|
+
const quadKeys = new Set();
|
|
99
|
+
for (const quad of state.quads) {
|
|
100
|
+
quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
|
|
101
|
+
}
|
|
68
102
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
|
|
103
|
+
// Direct Set iteration - more efficient than filter()
|
|
104
|
+
const filteredRemove = [];
|
|
105
|
+
for (const quad of state.removeSet) {
|
|
106
|
+
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
107
|
+
if (!quadKeys.has(key)) {
|
|
108
|
+
filteredRemove.push(quad);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
74
111
|
|
|
75
|
-
|
|
76
|
-
const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
|
|
77
|
-
Object.defineProperty(token, '_carriers', {
|
|
78
|
-
enumerable: false, writable: true, value: null
|
|
79
|
-
});
|
|
80
|
-
return token;
|
|
112
|
+
return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
|
|
81
113
|
}
|
|
82
114
|
|
|
115
|
+
|
|
116
|
+
// Cache for fence regex patterns - using shared utility
|
|
117
|
+
|
|
83
118
|
function getCarriers(token) {
|
|
84
119
|
if (token.type === 'code') return [];
|
|
85
120
|
return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
|
|
86
121
|
}
|
|
87
122
|
|
|
88
|
-
const createListToken = (type, line, lineStart, pos, match) => {
|
|
89
|
-
const attrs = match[4] || null;
|
|
90
|
-
const prefix = match[1].length + (match[2] ? match[2].length : 0);
|
|
91
|
-
const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
|
|
92
|
-
return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
|
|
93
|
-
rangeInfo.attrsRange, rangeInfo.valueRange, { indent: match[1].length });
|
|
94
|
-
};
|
|
95
|
-
|
|
96
123
|
function scanTokens(text) {
|
|
97
124
|
const tokens = [];
|
|
98
125
|
const lines = text.split('\n');
|
|
99
126
|
let pos = 0;
|
|
100
127
|
let codeBlock = null;
|
|
101
|
-
|
|
102
|
-
// Direct lookup instead of linear search
|
|
103
128
|
const PROCESSORS = [
|
|
104
129
|
{ type: 'fence', test: line => FENCE_REGEX.test(line.trim()), process: handleFence },
|
|
105
130
|
{ type: 'content', test: () => codeBlock, process: line => codeBlock.content.push(line) },
|
|
@@ -206,16 +231,6 @@ function scanTokens(text) {
|
|
|
206
231
|
return tokens;
|
|
207
232
|
}
|
|
208
233
|
|
|
209
|
-
function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
|
|
210
|
-
return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
// Pre-compiled carrier patterns for better performance
|
|
214
|
-
const CARRIER_PATTERN_ARRAY = [
|
|
215
|
-
['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
|
|
216
|
-
['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
|
|
217
|
-
];
|
|
218
|
-
|
|
219
234
|
function extractInlineCarriers(text, baseOffset = 0) {
|
|
220
235
|
const carriers = [];
|
|
221
236
|
let pos = 0;
|
|
@@ -280,73 +295,124 @@ function extractInlineCarriers(text, baseOffset = 0) {
|
|
|
280
295
|
return carriers;
|
|
281
296
|
}
|
|
282
297
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
const
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
298
|
+
|
|
299
|
+
function createBlockEntry(token, state) {
|
|
300
|
+
const blockId = token._blockId || hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
|
|
301
|
+
token._blockId = blockId; // Store for later reference
|
|
302
|
+
|
|
303
|
+
const cleanText = extractCleanText(token);
|
|
304
|
+
|
|
305
|
+
const blockEntry = {
|
|
306
|
+
id: blockId,
|
|
307
|
+
type: token.type,
|
|
308
|
+
range: token.range,
|
|
309
|
+
text: cleanText,
|
|
310
|
+
subject: null,
|
|
311
|
+
types: [],
|
|
312
|
+
predicates: [],
|
|
313
|
+
carriers: [],
|
|
314
|
+
listLevel: token.indent || 0,
|
|
315
|
+
parentBlockId: state.blockStack.length > 0 ? state.blockStack[state.blockStack.length - 1] : null,
|
|
316
|
+
quadKeys: [] // Will be populated during quad emission
|
|
293
317
|
};
|
|
318
|
+
|
|
319
|
+
// Store block and add to document structure
|
|
320
|
+
state.origin.blocks.set(blockId, blockEntry);
|
|
321
|
+
state.origin.documentStructure.push(blockEntry);
|
|
322
|
+
|
|
323
|
+
return blockEntry;
|
|
294
324
|
}
|
|
295
325
|
|
|
296
|
-
function
|
|
297
|
-
|
|
298
|
-
|
|
326
|
+
function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
|
|
327
|
+
// Update subject if available
|
|
328
|
+
if (sem.subject && sem.subject !== 'RESET') {
|
|
329
|
+
const resolvedSubject = resolveSubject(sem, state);
|
|
330
|
+
if (resolvedSubject) {
|
|
331
|
+
blockEntry.subject = resolvedSubject.value;
|
|
332
|
+
}
|
|
333
|
+
}
|
|
299
334
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
335
|
+
// Add types
|
|
336
|
+
if (sem.types && sem.types.length > 0) {
|
|
337
|
+
sem.types.forEach(t => {
|
|
338
|
+
const typeIRI = typeof t === 'string' ? t : t.iri;
|
|
339
|
+
const expanded = expandIRI(typeIRI, state.ctx);
|
|
340
|
+
if (!blockEntry.types.includes(expanded)) {
|
|
341
|
+
blockEntry.types.push(expanded);
|
|
342
|
+
}
|
|
343
|
+
});
|
|
304
344
|
}
|
|
305
345
|
|
|
306
|
-
|
|
307
|
-
|
|
346
|
+
// Add predicates
|
|
347
|
+
if (sem.predicates && sem.predicates.length > 0) {
|
|
348
|
+
sem.predicates.forEach(pred => {
|
|
349
|
+
const expandedPred = {
|
|
350
|
+
iri: expandIRI(pred.iri, state.ctx),
|
|
351
|
+
form: pred.form || '',
|
|
352
|
+
object: null // Will be filled during quad emission
|
|
353
|
+
};
|
|
354
|
+
blockEntry.predicates.push(expandedPred);
|
|
355
|
+
});
|
|
356
|
+
}
|
|
308
357
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
358
|
+
// Add carrier information
|
|
359
|
+
if (carrier) {
|
|
360
|
+
const carrierInfo = {
|
|
361
|
+
type: carrier.type,
|
|
362
|
+
range: carrier.range,
|
|
363
|
+
text: carrier.text,
|
|
364
|
+
subject: null,
|
|
365
|
+
predicates: []
|
|
366
|
+
};
|
|
312
367
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
368
|
+
// Extract carrier-specific semantics
|
|
369
|
+
if (carrier.attrs) {
|
|
370
|
+
const carrierSem = parseSemCached(carrier.attrs);
|
|
371
|
+
if (carrierSem.types) {
|
|
372
|
+
carrierInfo.predicates = carrierSem.predicates || [];
|
|
373
|
+
}
|
|
318
374
|
}
|
|
319
|
-
}
|
|
320
375
|
|
|
321
|
-
|
|
376
|
+
blockEntry.carriers.push(carrierInfo);
|
|
377
|
+
}
|
|
322
378
|
}
|
|
323
379
|
|
|
324
|
-
function
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
const attrsStart = wsMatch ? wsMatch[0].length : 0;
|
|
331
|
-
|
|
332
|
-
if (remaining[attrsStart] === '{') {
|
|
333
|
-
const braceEnd = remaining.indexOf('}', attrsStart);
|
|
334
|
-
if (braceEnd !== -1) {
|
|
335
|
-
attrs = remaining.substring(attrsStart, braceEnd + 1);
|
|
336
|
-
const absStart = baseOffset + spanEnd + attrsStart;
|
|
337
|
-
attrsRange = [absStart, absStart + attrs.length];
|
|
338
|
-
spanEnd += braceEnd + 1;
|
|
339
|
-
}
|
|
380
|
+
function processAnnotationWithBlockTracking(carrier, sem, state, options = {}) {
|
|
381
|
+
const { preserveGlobalSubject = false, implicitSubject = null } = options;
|
|
382
|
+
|
|
383
|
+
if (sem.subject === 'RESET') {
|
|
384
|
+
state.currentSubject = null;
|
|
385
|
+
return;
|
|
340
386
|
}
|
|
341
387
|
|
|
342
|
-
|
|
343
|
-
|
|
388
|
+
const previousSubject = state.currentSubject;
|
|
389
|
+
const newSubject = resolveSubject(sem, state);
|
|
390
|
+
const localObject = resolveObject(sem, state);
|
|
391
|
+
|
|
392
|
+
const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
|
|
393
|
+
if (newSubject && !preserveGlobalSubject && !implicitSubject) {
|
|
394
|
+
state.currentSubject = newSubject;
|
|
395
|
+
}
|
|
396
|
+
const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
|
|
397
|
+
if (!S) return;
|
|
398
|
+
|
|
399
|
+
const block = createBlock(
|
|
400
|
+
S.value, sem.types, sem.predicates,
|
|
401
|
+
carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
|
|
402
|
+
carrier.type || null, state.ctx, carrier.text
|
|
403
|
+
);
|
|
404
|
+
|
|
405
|
+
const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
|
|
406
|
+
const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
|
|
407
|
+
const newSubjectOrCarrierO = newSubject || carrierO;
|
|
344
408
|
|
|
345
|
-
|
|
346
|
-
if (
|
|
347
|
-
|
|
409
|
+
// Enrich current block with semantic information
|
|
410
|
+
if (state.currentBlock) {
|
|
411
|
+
enrichBlockFromAnnotation(state.currentBlock, sem, carrier, state);
|
|
348
412
|
}
|
|
349
|
-
|
|
413
|
+
|
|
414
|
+
processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
|
|
415
|
+
processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
|
|
350
416
|
}
|
|
351
417
|
|
|
352
418
|
function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
|
|
@@ -371,7 +437,7 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
|
|
|
371
437
|
};
|
|
372
438
|
}
|
|
373
439
|
|
|
374
|
-
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
|
|
440
|
+
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null, state = null) {
|
|
375
441
|
if (!subject || !predicate || !object) return;
|
|
376
442
|
|
|
377
443
|
const quad = dataFactory.quad(subject, predicate, object);
|
|
@@ -407,29 +473,21 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
|
|
|
407
473
|
// Detect rdf:Statement pattern during single-pass parsing
|
|
408
474
|
detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
|
|
409
475
|
|
|
410
|
-
// Create lean origin entry
|
|
411
|
-
const originEntry =
|
|
412
|
-
blockId: block.id,
|
|
413
|
-
range: block.range,
|
|
414
|
-
carrierType: block.carrierType,
|
|
415
|
-
subject: subject.value,
|
|
416
|
-
predicate: predicate.value,
|
|
417
|
-
context: block.context, // Direct reference instead of spread
|
|
418
|
-
polarity: meta?.remove ? '-' : '+',
|
|
419
|
-
value: block.text || ''
|
|
420
|
-
};
|
|
476
|
+
// Create lean origin entry using shared utility
|
|
477
|
+
const originEntry = createLeanOriginEntry(block, subject, predicate, meta);
|
|
421
478
|
|
|
422
479
|
quadIndex.set(quadKey, originEntry);
|
|
480
|
+
|
|
481
|
+
// Link block to this quad for reverse lookup during rendering
|
|
482
|
+
if (state.currentBlock && block.id === state.currentBlock.id) {
|
|
483
|
+
if (!state.currentBlock.quadKeys) {
|
|
484
|
+
state.currentBlock.quadKeys = [];
|
|
485
|
+
}
|
|
486
|
+
state.currentBlock.quadKeys.push(quadKey);
|
|
487
|
+
}
|
|
423
488
|
}
|
|
424
489
|
}
|
|
425
490
|
|
|
426
|
-
// Extract RDF constants once at module level for efficiency
|
|
427
|
-
const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
|
|
428
|
-
const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
|
|
429
|
-
const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
|
|
430
|
-
const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
|
|
431
|
-
const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
|
|
432
|
-
|
|
433
491
|
function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
|
|
434
492
|
// Skip if not called from parse context (for testing compatibility)
|
|
435
493
|
if (!statements || !statementCandidates) return;
|
|
@@ -479,30 +537,6 @@ function detectStatementPatternSinglePass(quad, dataFactory, meta, statements =
|
|
|
479
537
|
}
|
|
480
538
|
}
|
|
481
539
|
|
|
482
|
-
const resolveFragment = (fragment, state) => {
|
|
483
|
-
if (!state.currentSubject) return null;
|
|
484
|
-
const subjectValue = state.currentSubject.value;
|
|
485
|
-
const hashIndex = subjectValue.indexOf('#');
|
|
486
|
-
const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
|
|
487
|
-
return state.df.namedNode(baseIRI + '#' + fragment);
|
|
488
|
-
};
|
|
489
|
-
|
|
490
|
-
function resolveSubject(sem, state) {
|
|
491
|
-
if (!sem.subject) return null;
|
|
492
|
-
if (sem.subject === 'RESET') {
|
|
493
|
-
state.currentSubject = null;
|
|
494
|
-
return null;
|
|
495
|
-
}
|
|
496
|
-
if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
|
|
497
|
-
return state.df.namedNode(expandIRI(sem.subject, state.ctx));
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
function resolveObject(sem, state) {
|
|
501
|
-
if (!sem.object) return null;
|
|
502
|
-
if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
|
|
503
|
-
return state.df.namedNode(expandIRI(sem.object, state.ctx));
|
|
504
|
-
}
|
|
505
|
-
|
|
506
540
|
const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
|
|
507
541
|
const expandedType = expandIRI(typeIRI, state.ctx);
|
|
508
542
|
const typeInfo = typeof entryIndex === 'object' ? entryIndex : { entryIndex, remove: false };
|
|
@@ -513,7 +547,8 @@ const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
|
|
|
513
547
|
state.df.namedNode(expandedType),
|
|
514
548
|
state.df,
|
|
515
549
|
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
|
|
516
|
-
state.statements, state.statementCandidates
|
|
550
|
+
state.statements, state.statementCandidates,
|
|
551
|
+
state
|
|
517
552
|
);
|
|
518
553
|
};
|
|
519
554
|
|
|
@@ -557,43 +592,16 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
|
|
|
557
592
|
emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
|
|
558
593
|
role.subject, P, role.object, state.df,
|
|
559
594
|
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
|
|
560
|
-
state.statements, state.statementCandidates
|
|
595
|
+
state.statements, state.statementCandidates,
|
|
596
|
+
state
|
|
561
597
|
);
|
|
562
598
|
}
|
|
563
599
|
});
|
|
564
600
|
}
|
|
565
601
|
|
|
566
602
|
function processAnnotation(carrier, sem, state, options = {}) {
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
if (sem.subject === 'RESET') {
|
|
570
|
-
state.currentSubject = null;
|
|
571
|
-
return;
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
const previousSubject = state.currentSubject;
|
|
575
|
-
const newSubject = resolveSubject(sem, state);
|
|
576
|
-
const localObject = resolveObject(sem, state);
|
|
577
|
-
|
|
578
|
-
const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
|
|
579
|
-
if (newSubject && !preserveGlobalSubject && !implicitSubject) {
|
|
580
|
-
state.currentSubject = newSubject;
|
|
581
|
-
}
|
|
582
|
-
const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
|
|
583
|
-
if (!S) return;
|
|
584
|
-
|
|
585
|
-
const block = createBlock(
|
|
586
|
-
S.value, sem.types, sem.predicates,
|
|
587
|
-
carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
|
|
588
|
-
carrier.type || null, state.ctx, carrier.text
|
|
589
|
-
);
|
|
590
|
-
|
|
591
|
-
const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
|
|
592
|
-
const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
|
|
593
|
-
const newSubjectOrCarrierO = newSubject || carrierO;
|
|
594
|
-
|
|
595
|
-
processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier);
|
|
596
|
-
processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier);
|
|
603
|
+
// Use the enhanced block tracking version
|
|
604
|
+
processAnnotationWithBlockTracking(carrier, sem, state, options);
|
|
597
605
|
}
|
|
598
606
|
|
|
599
607
|
|
|
@@ -634,77 +642,9 @@ function processStandaloneSubject(token, state) {
|
|
|
634
642
|
}
|
|
635
643
|
|
|
636
644
|
const TOKEN_PROCESSORS = {
|
|
637
|
-
heading: (token, state) =>
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
},
|
|
643
|
-
blockquote: (token, state) => {
|
|
644
|
-
processTokenAnnotations(token, state, token.type);
|
|
645
|
-
},
|
|
646
|
-
para: (token, state) => {
|
|
647
|
-
processStandaloneSubject(token, state);
|
|
648
|
-
processTokenAnnotations(token, state, token.type);
|
|
649
|
-
},
|
|
650
|
-
list: (token, state) => {
|
|
651
|
-
processTokenAnnotations(token, state, token.type);
|
|
652
|
-
},
|
|
645
|
+
heading: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
|
|
646
|
+
code: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
|
|
647
|
+
blockquote: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
|
|
648
|
+
para: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry, [processStandaloneSubject]),
|
|
649
|
+
list: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
|
|
653
650
|
};
|
|
654
|
-
|
|
655
|
-
export function parse(text, options = {}) {
|
|
656
|
-
const state = {
|
|
657
|
-
ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
|
|
658
|
-
df: options.dataFactory || DataFactory,
|
|
659
|
-
quads: [],
|
|
660
|
-
quadBuffer: new Map(),
|
|
661
|
-
removeSet: new Set(),
|
|
662
|
-
origin: { quadIndex: new Map() },
|
|
663
|
-
currentSubject: null,
|
|
664
|
-
tokens: null,
|
|
665
|
-
currentTokenIndex: -1,
|
|
666
|
-
statements: [],
|
|
667
|
-
statementCandidates: new Map() // Track incomplete rdf:Statement patterns
|
|
668
|
-
};
|
|
669
|
-
|
|
670
|
-
state.tokens = scanTokens(text);
|
|
671
|
-
|
|
672
|
-
// Single loop instead of filter+forEach for better performance
|
|
673
|
-
for (const token of state.tokens) {
|
|
674
|
-
if (token.type === 'prefix') {
|
|
675
|
-
let resolvedIri = token.iri;
|
|
676
|
-
if (token.iri.includes(':')) {
|
|
677
|
-
const colonIndex = token.iri.indexOf(':');
|
|
678
|
-
const potentialPrefix = token.iri.substring(0, colonIndex);
|
|
679
|
-
const reference = token.iri.substring(colonIndex + 1);
|
|
680
|
-
if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
|
|
681
|
-
resolvedIri = state.ctx[potentialPrefix] + reference;
|
|
682
|
-
}
|
|
683
|
-
}
|
|
684
|
-
state.ctx[token.prefix] = resolvedIri;
|
|
685
|
-
}
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
for (let i = 0; i < state.tokens.length; i++) {
|
|
689
|
-
const token = state.tokens[i];
|
|
690
|
-
state.currentTokenIndex = i;
|
|
691
|
-
TOKEN_PROCESSORS[token.type]?.(token, state);
|
|
692
|
-
}
|
|
693
|
-
|
|
694
|
-
// Optimize array operations - avoid Array.from() and filter()
|
|
695
|
-
const quadKeys = new Set();
|
|
696
|
-
for (const quad of state.quads) {
|
|
697
|
-
quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
|
|
698
|
-
}
|
|
699
|
-
|
|
700
|
-
// Direct iteration instead of Array.from() + filter()
|
|
701
|
-
const filteredRemove = [];
|
|
702
|
-
for (const quad of state.removeSet) {
|
|
703
|
-
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
704
|
-
if (!quadKeys.has(key)) {
|
|
705
|
-
filteredRemove.push(quad);
|
|
706
|
-
}
|
|
707
|
-
}
|
|
708
|
-
|
|
709
|
-
return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
|
|
710
|
-
}
|