mdld-parse 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -2
- package/package.json +1 -1
- package/src/parse.js +410 -311
- package/src/serialize.js +527 -0
package/src/parse.js
CHANGED
|
@@ -9,23 +9,20 @@ import {
|
|
|
9
9
|
hash
|
|
10
10
|
} from './utils.js';
|
|
11
11
|
|
|
12
|
-
// Constants and patterns
|
|
13
12
|
const URL_REGEX = /^[a-zA-Z][a-zA-Z0-9+.-]*:/;
|
|
14
13
|
const FENCE_REGEX = /^(`{3,})(.*)/;
|
|
15
14
|
const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
|
|
16
15
|
const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
|
|
17
|
-
const
|
|
16
|
+
const UNORDERED_LIST_REGEX = /^(\s*)([-*+])\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
|
|
17
|
+
const ORDERED_LIST_REGEX = /^(\s*)(\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
|
|
18
18
|
const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
|
|
19
19
|
const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
|
|
20
20
|
const LIST_CONTEXT_REGEX = /^(.+?)\s*\{([^}]+)\}$/;
|
|
21
|
-
|
|
22
|
-
// Inline carrier pattern constants
|
|
23
21
|
const INLINE_CARRIER_PATTERNS = {
|
|
24
22
|
EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
|
|
25
23
|
CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
|
|
26
24
|
};
|
|
27
25
|
|
|
28
|
-
// Semantic block cache to avoid repeated parsing
|
|
29
26
|
const semCache = {};
|
|
30
27
|
const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
|
|
31
28
|
|
|
@@ -39,41 +36,50 @@ function parseSemCached(attrs) {
|
|
|
39
36
|
return sem;
|
|
40
37
|
}
|
|
41
38
|
|
|
39
|
+
function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
|
|
40
|
+
const wsLength = prefixLength < line.length && line[prefixLength] === ' ' ? 1 :
|
|
41
|
+
line.slice(prefixLength).match(/^\s+/)?.[0]?.length || 0;
|
|
42
|
+
const valueStartInLine = prefixLength + wsLength;
|
|
43
|
+
return {
|
|
44
|
+
valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
|
|
45
|
+
attrsRange: calcAttrsRange(line, attrs, lineStart)
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
|
|
42
49
|
function calcAttrsRange(line, attrs, lineStart) {
|
|
43
50
|
if (!attrs) return null;
|
|
44
51
|
const attrsStartInLine = line.lastIndexOf(attrs);
|
|
45
52
|
return attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length] : null;
|
|
46
53
|
}
|
|
47
54
|
|
|
48
|
-
function calcValueRange(lineStart, valueStartInLine, valueEndInLine) {
|
|
49
|
-
return [lineStart + valueStartInLine, lineStart + valueEndInLine];
|
|
50
|
-
}
|
|
51
|
-
|
|
52
55
|
function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
|
|
53
56
|
const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
|
|
54
|
-
// Add lazy carrier caching
|
|
55
57
|
Object.defineProperty(token, '_carriers', {
|
|
56
|
-
enumerable: false,
|
|
57
|
-
writable: true,
|
|
58
|
-
value: null
|
|
58
|
+
enumerable: false, writable: true, value: null
|
|
59
59
|
});
|
|
60
60
|
return token;
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
function getCarriers(token) {
|
|
64
|
-
if (
|
|
65
|
-
|
|
66
|
-
}
|
|
67
|
-
return token._carriers;
|
|
64
|
+
if (token.type === 'code') return [];
|
|
65
|
+
return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
|
|
68
66
|
}
|
|
69
67
|
|
|
68
|
+
const createListToken = (type, line, lineStart, pos, match, indent = null) => {
|
|
69
|
+
const attrs = match[4] || null;
|
|
70
|
+
const prefix = match[1].length + (match[2] ? match[2].length : 0);
|
|
71
|
+
const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
|
|
72
|
+
const extra = indent !== null ? { indent } : { indent: match[1].length, number: parseInt(match[2]) };
|
|
73
|
+
return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
|
|
74
|
+
rangeInfo.attrsRange, rangeInfo.valueRange, extra);
|
|
75
|
+
};
|
|
76
|
+
|
|
70
77
|
function scanTokens(text) {
|
|
71
78
|
const tokens = [];
|
|
72
79
|
const lines = text.split('\n');
|
|
73
80
|
let pos = 0;
|
|
74
81
|
let codeBlock = null;
|
|
75
82
|
|
|
76
|
-
// Token processors in order of priority
|
|
77
83
|
const processors = [
|
|
78
84
|
{
|
|
79
85
|
test: line => line.startsWith('```'),
|
|
@@ -109,14 +115,14 @@ function scanTokens(text) {
|
|
|
109
115
|
});
|
|
110
116
|
codeBlock = null;
|
|
111
117
|
}
|
|
112
|
-
return true;
|
|
118
|
+
return true;
|
|
113
119
|
}
|
|
114
120
|
},
|
|
115
121
|
{
|
|
116
122
|
test: () => codeBlock,
|
|
117
123
|
process: line => {
|
|
118
124
|
codeBlock.content.push(line);
|
|
119
|
-
return true;
|
|
125
|
+
return true;
|
|
120
126
|
}
|
|
121
127
|
},
|
|
122
128
|
{
|
|
@@ -124,7 +130,7 @@ function scanTokens(text) {
|
|
|
124
130
|
process: (line, lineStart, pos) => {
|
|
125
131
|
const match = PREFIX_REGEX.exec(line);
|
|
126
132
|
tokens.push({ type: 'prefix', prefix: match[1], iri: match[2].trim() });
|
|
127
|
-
return true;
|
|
133
|
+
return true;
|
|
128
134
|
}
|
|
129
135
|
},
|
|
130
136
|
{
|
|
@@ -133,32 +139,26 @@ function scanTokens(text) {
|
|
|
133
139
|
const match = HEADING_REGEX.exec(line);
|
|
134
140
|
const attrs = match[3] || null;
|
|
135
141
|
const afterHashes = match[1].length;
|
|
136
|
-
const
|
|
137
|
-
line.slice(afterHashes).match(/^\s+/)?.[0]?.length || 0;
|
|
138
|
-
const valueStartInLine = afterHashes + wsLength;
|
|
139
|
-
const valueEndInLine = valueStartInLine + match[2].length;
|
|
142
|
+
const rangeInfo = calcRangeInfo(line, attrs, lineStart, afterHashes, match[2].length);
|
|
140
143
|
tokens.push(createToken('heading', [lineStart, pos - 1], match[2].trim(), attrs,
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
{ depth: match[1].length }));
|
|
144
|
-
return true; // handled
|
|
144
|
+
rangeInfo.attrsRange, rangeInfo.valueRange, { depth: match[1].length }));
|
|
145
|
+
return true;
|
|
145
146
|
}
|
|
146
147
|
},
|
|
147
148
|
{
|
|
148
|
-
test: line =>
|
|
149
|
+
test: line => UNORDERED_LIST_REGEX.test(line),
|
|
149
150
|
process: (line, lineStart, pos) => {
|
|
150
|
-
const match =
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
return true; // handled
|
|
151
|
+
const match = UNORDERED_LIST_REGEX.exec(line);
|
|
152
|
+
tokens.push(createListToken('unordered-list', line, lineStart, pos, match, match[1].length));
|
|
153
|
+
return true;
|
|
154
|
+
}
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
test: line => ORDERED_LIST_REGEX.test(line),
|
|
158
|
+
process: (line, lineStart, pos) => {
|
|
159
|
+
const match = ORDERED_LIST_REGEX.exec(line);
|
|
160
|
+
tokens.push(createListToken('ordered-list', line, lineStart, pos, match));
|
|
161
|
+
return true;
|
|
162
162
|
}
|
|
163
163
|
},
|
|
164
164
|
{
|
|
@@ -170,15 +170,15 @@ function scanTokens(text) {
|
|
|
170
170
|
const valueEndInLine = valueStartInLine + match[1].length;
|
|
171
171
|
tokens.push(createToken('blockquote', [lineStart, pos - 1], match[1].trim(), attrs,
|
|
172
172
|
calcAttrsRange(line, attrs, lineStart),
|
|
173
|
-
|
|
174
|
-
return true;
|
|
173
|
+
[lineStart + valueStartInLine, lineStart + valueEndInLine]));
|
|
174
|
+
return true;
|
|
175
175
|
}
|
|
176
176
|
},
|
|
177
177
|
{
|
|
178
178
|
test: line => line.trim(),
|
|
179
179
|
process: (line, lineStart, pos) => {
|
|
180
180
|
tokens.push(createToken('para', [lineStart, pos - 1], line.trim()));
|
|
181
|
-
return true;
|
|
181
|
+
return true;
|
|
182
182
|
}
|
|
183
183
|
}
|
|
184
184
|
];
|
|
@@ -191,7 +191,7 @@ function scanTokens(text) {
|
|
|
191
191
|
// Try each processor until one handles the line
|
|
192
192
|
for (const processor of processors) {
|
|
193
193
|
if (processor.test(line) && processor.process(line, lineStart, pos)) {
|
|
194
|
-
break;
|
|
194
|
+
break;
|
|
195
195
|
}
|
|
196
196
|
}
|
|
197
197
|
}
|
|
@@ -207,44 +207,35 @@ function extractInlineCarriers(text, baseOffset = 0) {
|
|
|
207
207
|
const carriers = [];
|
|
208
208
|
let pos = 0;
|
|
209
209
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
// Angle-bracket URLs: <URL>{...}
|
|
213
|
-
if (text[pos] === '<') {
|
|
210
|
+
const CARRIER_EXTRACTORS = {
|
|
211
|
+
'<': (text, pos, baseOffset) => {
|
|
214
212
|
const angleEnd = text.indexOf('>', pos);
|
|
215
|
-
if (angleEnd
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
}
|
|
225
|
-
return null;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
// Bracketed links: [text](URL){...} and [text]{...}
|
|
229
|
-
if (text[pos] === '[') {
|
|
213
|
+
if (angleEnd === -1) return null;
|
|
214
|
+
const url = text.slice(pos + 1, angleEnd);
|
|
215
|
+
if (!URL_REGEX.test(url)) return null;
|
|
216
|
+
const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, angleEnd + 1, baseOffset);
|
|
217
|
+
return createCarrier('link', url, attrs, attrsRange,
|
|
218
|
+
[baseOffset + pos + 1, baseOffset + angleEnd],
|
|
219
|
+
[baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url });
|
|
220
|
+
},
|
|
221
|
+
'[': (text, pos, baseOffset) => {
|
|
230
222
|
const bracketEnd = findMatchingBracket(text, pos);
|
|
231
|
-
if (bracketEnd)
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
[baseOffset + pos + 1, baseOffset + bracketEnd - 1],
|
|
241
|
-
[baseOffset + pos, baseOffset + finalSpanEnd],
|
|
242
|
-
finalSpanEnd, { url: resourceIRI });
|
|
243
|
-
}
|
|
244
|
-
return null;
|
|
223
|
+
if (!bracketEnd) return null;
|
|
224
|
+
const carrierText = text.slice(pos + 1, bracketEnd - 1);
|
|
225
|
+
const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
|
|
226
|
+
const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
|
|
227
|
+
const { carrierType, resourceIRI } = determineCarrierType(url);
|
|
228
|
+
if (url?.startsWith('=')) return { skip: true, pos: finalSpanEnd };
|
|
229
|
+
return createCarrier(carrierType, carrierText, attrs, attrsRange,
|
|
230
|
+
[baseOffset + pos + 1, baseOffset + bracketEnd - 1],
|
|
231
|
+
[baseOffset + pos, baseOffset + finalSpanEnd], finalSpanEnd, { url: resourceIRI });
|
|
245
232
|
}
|
|
233
|
+
};
|
|
234
|
+
|
|
235
|
+
const extractCarrier = (text, pos, baseOffset) => {
|
|
236
|
+
const extractor = CARRIER_EXTRACTORS[text[pos]];
|
|
237
|
+
if (extractor) return extractor(text, pos, baseOffset);
|
|
246
238
|
|
|
247
|
-
// Regex-based carriers: emphasis and code spans
|
|
248
239
|
for (const [type, pattern] of Object.entries(INLINE_CARRIER_PATTERNS)) {
|
|
249
240
|
pattern.lastIndex = pos;
|
|
250
241
|
const match = pattern.exec(text);
|
|
@@ -255,7 +246,6 @@ function extractInlineCarriers(text, baseOffset = 0) {
|
|
|
255
246
|
ranges.attrsRange, ranges.valueRange, ranges.range, ranges.pos);
|
|
256
247
|
}
|
|
257
248
|
}
|
|
258
|
-
|
|
259
249
|
return null;
|
|
260
250
|
};
|
|
261
251
|
|
|
@@ -352,14 +342,9 @@ function createBlock(subject, types, predicates, entries, range, attrsRange, val
|
|
|
352
342
|
predicates: predicates.map(p => ({ iri: expandIRI(p.iri, ctx), form: p.form }))
|
|
353
343
|
};
|
|
354
344
|
|
|
355
|
-
const signature = [
|
|
356
|
-
subject,
|
|
357
|
-
carrierType || 'unknown',
|
|
358
|
-
expanded.types.join(','),
|
|
359
|
-
expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')
|
|
360
|
-
].join('|');
|
|
361
|
-
|
|
345
|
+
const signature = [subject, carrierType || 'unknown', expanded.types.join(','), expanded.predicates.map(p => `${p.form}${p.iri}`).join(',')].join('|');
|
|
362
346
|
const blockId = hash(signature);
|
|
347
|
+
|
|
363
348
|
return {
|
|
364
349
|
id: blockId,
|
|
365
350
|
range: { start: range[0], end: range[1] },
|
|
@@ -386,93 +371,75 @@ function emitQuad(quads, quadIndex, blockId, subject, predicate, object, dataFac
|
|
|
386
371
|
|
|
387
372
|
quadIndex.set(quadIndexKey(quad.subject, quad.predicate, quad.object), slotInfo);
|
|
388
373
|
}
|
|
374
|
+
const resolveFragment = (fragment, state) => {
|
|
375
|
+
if (!state.currentSubject) return null;
|
|
376
|
+
const baseIRI = state.currentSubject.value.split('#')[0];
|
|
377
|
+
return state.df.namedNode(`${baseIRI}#${fragment}`);
|
|
378
|
+
};
|
|
379
|
+
|
|
389
380
|
function resolveSubject(sem, state) {
|
|
390
381
|
if (!sem.subject) return null;
|
|
391
382
|
if (sem.subject === 'RESET') {
|
|
392
383
|
state.currentSubject = null;
|
|
393
384
|
return null;
|
|
394
385
|
}
|
|
395
|
-
if (sem.subject.startsWith('=#'))
|
|
396
|
-
const fragment = sem.subject.substring(2);
|
|
397
|
-
if (state.currentSubject) {
|
|
398
|
-
const baseIRI = state.currentSubject.value.split('#')[0];
|
|
399
|
-
return state.df.namedNode(`${baseIRI}#${fragment}`);
|
|
400
|
-
}
|
|
401
|
-
return null;
|
|
402
|
-
}
|
|
386
|
+
if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
|
|
403
387
|
return state.df.namedNode(expandIRI(sem.subject, state.ctx));
|
|
404
388
|
}
|
|
405
389
|
|
|
406
390
|
function resolveObject(sem, state) {
|
|
407
391
|
if (!sem.object) return null;
|
|
408
|
-
if (sem.object.startsWith('#'))
|
|
409
|
-
const fragment = sem.object.substring(1);
|
|
410
|
-
if (state.currentSubject) {
|
|
411
|
-
const baseIRI = state.currentSubject.value.split('#')[0];
|
|
412
|
-
return state.df.namedNode(`${baseIRI}#${fragment}`);
|
|
413
|
-
}
|
|
414
|
-
return null;
|
|
415
|
-
}
|
|
392
|
+
if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
|
|
416
393
|
return state.df.namedNode(expandIRI(sem.object, state.ctx));
|
|
417
394
|
}
|
|
418
395
|
|
|
396
|
+
const createTypeQuad = (typeIRI, subject, state, blockId, entryIndex = null) => {
|
|
397
|
+
const expandedType = expandIRI(typeIRI, state.ctx);
|
|
398
|
+
emitQuad(
|
|
399
|
+
state.quads, state.origin.quadIndex, blockId,
|
|
400
|
+
subject,
|
|
401
|
+
state.df.namedNode(expandIRI('rdf:type', state.ctx)),
|
|
402
|
+
state.df.namedNode(expandedType),
|
|
403
|
+
state.df,
|
|
404
|
+
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
|
|
405
|
+
);
|
|
406
|
+
};
|
|
407
|
+
|
|
419
408
|
function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier) {
|
|
420
409
|
sem.types.forEach(t => {
|
|
421
410
|
const typeIRI = typeof t === 'string' ? t : t.iri;
|
|
422
411
|
const entryIndex = typeof t === 'string' ? null : t.entryIndex;
|
|
423
|
-
|
|
424
|
-
// For angle-bracket URLs and bracketed links [text](URL), use the URL as the subject
|
|
425
|
-
// for type declarations when there's no explicit subject declaration.
|
|
426
|
-
// This implements {+URL} soft subject behavior.
|
|
427
412
|
let typeSubject = newSubject ? newSubject : (localObject || carrierO || S);
|
|
428
413
|
if (carrier?.type === 'link' && carrier?.url && !newSubject) {
|
|
429
|
-
typeSubject = carrierO;
|
|
414
|
+
typeSubject = carrierO;
|
|
430
415
|
}
|
|
431
|
-
|
|
432
|
-
const expandedType = expandIRI(typeIRI, state.ctx);
|
|
433
|
-
|
|
434
|
-
emitQuad(
|
|
435
|
-
state.quads, state.origin.quadIndex, block.id,
|
|
436
|
-
typeSubject,
|
|
437
|
-
state.df.namedNode(expandIRI('rdf:type', state.ctx)),
|
|
438
|
-
state.df.namedNode(expandedType),
|
|
439
|
-
state.df,
|
|
440
|
-
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
|
|
441
|
-
);
|
|
416
|
+
createTypeQuad(typeIRI, typeSubject, state, block.id, entryIndex);
|
|
442
417
|
});
|
|
443
418
|
}
|
|
444
419
|
|
|
420
|
+
const determinePredicateRole = (pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L) => {
|
|
421
|
+
if (pred.form === '' && carrier?.type === 'link' && carrier?.url && carrier.text === carrier.url) {
|
|
422
|
+
return null;
|
|
423
|
+
}
|
|
424
|
+
switch (pred.form) {
|
|
425
|
+
case '':
|
|
426
|
+
return carrier?.type === 'link' && carrier?.url && carrier.text !== carrier.url && !newSubject
|
|
427
|
+
? { subject: newSubjectOrCarrierO, object: L }
|
|
428
|
+
: { subject: localObject || S, object: L };
|
|
429
|
+
case '?':
|
|
430
|
+
return { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
|
|
431
|
+
case '!':
|
|
432
|
+
return { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
|
|
433
|
+
default:
|
|
434
|
+
return null;
|
|
435
|
+
}
|
|
436
|
+
};
|
|
437
|
+
|
|
445
438
|
function processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state, carrier) {
|
|
446
439
|
sem.predicates.forEach(pred => {
|
|
447
|
-
const
|
|
448
|
-
|
|
449
|
-
// Skip literal predicates for angle-bracket URLs only
|
|
450
|
-
if (pred.form === '' && carrier?.type === 'link' && carrier?.url && carrier.text === carrier.url) {
|
|
451
|
-
return;
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
// Determine subject/object roles based on predicate form
|
|
455
|
-
let role;
|
|
456
|
-
switch (pred.form) {
|
|
457
|
-
case '':
|
|
458
|
-
// For bracketed links with literal predicates and no explicit subject, use URL as subject
|
|
459
|
-
if (carrier?.type === 'link' && carrier?.url && carrier.text !== carrier.url && !newSubject) {
|
|
460
|
-
role = { subject: newSubjectOrCarrierO, object: L };
|
|
461
|
-
} else {
|
|
462
|
-
role = { subject: localObject || S, object: L };
|
|
463
|
-
}
|
|
464
|
-
break;
|
|
465
|
-
case '?':
|
|
466
|
-
role = { subject: newSubject ? previousSubject : S, object: localObject || newSubjectOrCarrierO };
|
|
467
|
-
break;
|
|
468
|
-
case '!':
|
|
469
|
-
role = { subject: localObject || newSubjectOrCarrierO, object: newSubject ? previousSubject : S };
|
|
470
|
-
break;
|
|
471
|
-
default:
|
|
472
|
-
role = null;
|
|
473
|
-
}
|
|
474
|
-
|
|
440
|
+
const role = determinePredicateRole(pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L);
|
|
475
441
|
if (role) {
|
|
442
|
+
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
476
443
|
emitQuad(state.quads, state.origin.quadIndex, block.id,
|
|
477
444
|
role.subject, P, role.object, state.df,
|
|
478
445
|
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
|
|
@@ -493,7 +460,6 @@ function processAnnotation(carrier, sem, state, options = {}) {
|
|
|
493
460
|
const newSubject = resolveSubject(sem, state);
|
|
494
461
|
const localObject = resolveObject(sem, state);
|
|
495
462
|
|
|
496
|
-
// Use implicit subject if provided (for list items)
|
|
497
463
|
const effectiveSubject = implicitSubject || (newSubject && !preserveGlobalSubject ? newSubject : previousSubject);
|
|
498
464
|
if (newSubject && !preserveGlobalSubject && !implicitSubject) {
|
|
499
465
|
state.currentSubject = newSubject;
|
|
@@ -542,7 +508,6 @@ export function findItemSubject(listToken, carriers, state) {
|
|
|
542
508
|
}
|
|
543
509
|
|
|
544
510
|
function hasOwnPredicates(listToken, carriers) {
|
|
545
|
-
// Check for explicit predicates (excluding subject declarations)
|
|
546
511
|
if (listToken.attrs) {
|
|
547
512
|
const attrs = parseSemCached(listToken.attrs);
|
|
548
513
|
if (attrs.predicates.some(p => !p.subject && p.iri !== 'RESET')) {
|
|
@@ -555,9 +520,7 @@ function hasOwnPredicates(listToken, carriers) {
|
|
|
555
520
|
});
|
|
556
521
|
}
|
|
557
522
|
|
|
558
|
-
|
|
559
|
-
function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) {
|
|
560
|
-
// Emit types
|
|
523
|
+
const processContextSem = ({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) => {
|
|
561
524
|
sem.types.forEach(t => {
|
|
562
525
|
const typeIRI = typeof t === 'string' ? t : t.iri;
|
|
563
526
|
emitQuad(
|
|
@@ -569,7 +532,6 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
|
|
|
569
532
|
);
|
|
570
533
|
});
|
|
571
534
|
|
|
572
|
-
// Emit directional predicates
|
|
573
535
|
sem.predicates.forEach(pred => {
|
|
574
536
|
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
575
537
|
if (pred.form === '!') {
|
|
@@ -579,46 +541,35 @@ function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals =
|
|
|
579
541
|
}
|
|
580
542
|
});
|
|
581
543
|
|
|
582
|
-
// Optionally inherit literal predicates
|
|
583
544
|
if (inheritLiterals) {
|
|
584
545
|
const literalPredicates = sem.predicates.filter(p => p.form === '');
|
|
585
546
|
if (literalPredicates.length > 0) {
|
|
586
547
|
return {
|
|
587
|
-
subject: null,
|
|
588
|
-
object: null,
|
|
589
|
-
types: [],
|
|
548
|
+
subject: null, object: null, types: [],
|
|
590
549
|
predicates: literalPredicates.map(p => ({ iri: p.iri, form: p.form, entryIndex: p.entryIndex })),
|
|
591
|
-
datatype: null,
|
|
592
|
-
language: null,
|
|
593
|
-
entries: []
|
|
550
|
+
datatype: null, language: null, entries: []
|
|
594
551
|
};
|
|
595
552
|
}
|
|
596
553
|
}
|
|
597
554
|
return null;
|
|
598
|
-
}
|
|
555
|
+
};
|
|
599
556
|
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
// Pop stack frames for lists that have ended (strictly less indent)
|
|
603
|
-
while (
|
|
604
|
-
state.listStack.length &&
|
|
605
|
-
token.indent < state.listStack[state.listStack.length - 1].indent
|
|
606
|
-
) {
|
|
557
|
+
const manageListStack = (token, state) => {
|
|
558
|
+
while (state.listStack.length && token.indent < state.listStack[state.listStack.length - 1].indent) {
|
|
607
559
|
state.listStack.pop();
|
|
608
560
|
}
|
|
609
561
|
|
|
610
|
-
// If we have pending context, always create a new frame for it
|
|
611
562
|
if (state.pendingListContext) {
|
|
612
563
|
state.listStack.push({
|
|
613
564
|
indent: token.indent,
|
|
614
565
|
anchorSubject: state.pendingListContext.subject,
|
|
615
566
|
contextSubject: state.pendingListContext.subject,
|
|
616
|
-
contextSem: state.pendingListContext.sem
|
|
567
|
+
contextSem: state.pendingListContext.sem,
|
|
568
|
+
contextText: state.pendingListContext.contextText,
|
|
569
|
+
contextToken: state.pendingListContext.contextToken // Store context token for origins
|
|
617
570
|
});
|
|
618
571
|
state.pendingListContext = null;
|
|
619
572
|
} else if (state.listStack.length === 0 || token.indent > state.listStack[state.listStack.length - 1].indent) {
|
|
620
|
-
// Push empty frame for nested lists without explicit context
|
|
621
|
-
// Inherit anchorSubject from parent frame if available
|
|
622
573
|
const parentFrame = state.listStack.length > 0 ? state.listStack[state.listStack.length - 1] : null;
|
|
623
574
|
state.listStack.push({
|
|
624
575
|
indent: token.indent,
|
|
@@ -627,138 +578,278 @@ function manageListStack(token, state) {
|
|
|
627
578
|
contextSem: null
|
|
628
579
|
});
|
|
629
580
|
}
|
|
630
|
-
|
|
631
|
-
}
|
|
581
|
+
};
|
|
632
582
|
|
|
633
|
-
|
|
634
|
-
const
|
|
583
|
+
const combineSemanticInfo = (token, carriers, listFrame, state, itemSubject) => {
|
|
584
|
+
const combinedSem = { subject: null, object: null, types: [], predicates: [], datatype: null, language: null, entries: [] };
|
|
585
|
+
const addSem = (sem) => { combinedSem.types.push(...sem.types); combinedSem.predicates.push(...sem.predicates); combinedSem.entries.push(...sem.entries); };
|
|
635
586
|
|
|
636
|
-
|
|
587
|
+
if (listFrame?.contextSem) {
|
|
588
|
+
const inheritedSem = processContextSem({ sem: listFrame.contextSem, itemSubject, contextSubject: listFrame.contextSubject, inheritLiterals: true, state });
|
|
589
|
+
if (inheritedSem) addSem(inheritedSem);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
if (token.attrs) addSem(parseSemCached(token.attrs));
|
|
593
|
+
carriers.forEach(carrier => { if (carrier.attrs) addSem(parseSemCached(carrier.attrs)); });
|
|
594
|
+
|
|
595
|
+
return combinedSem;
|
|
596
|
+
};
|
|
597
|
+
|
|
598
|
+
const processListItem = (token, state) => {
|
|
599
|
+
const carriers = getCarriers(token);
|
|
637
600
|
const itemInfo = findItemSubject(token, carriers, state);
|
|
638
601
|
if (!itemInfo) return;
|
|
639
602
|
|
|
640
603
|
const { subject: itemSubject } = itemInfo;
|
|
641
|
-
|
|
642
|
-
// Update the current list frame to track this item's subject for nested contexts
|
|
643
|
-
if (state.listStack.length > 0) {
|
|
644
|
-
const currentFrame = state.listStack[state.listStack.length - 1];
|
|
645
|
-
currentFrame.anchorSubject = itemSubject;
|
|
646
|
-
}
|
|
604
|
+
if (state.listStack.length > 0) state.listStack[state.listStack.length - 1].anchorSubject = itemSubject;
|
|
647
605
|
|
|
648
606
|
const listFrame = state.listStack[state.listStack.length - 1];
|
|
607
|
+
const combinedSem = combineSemanticInfo(token, carriers, listFrame, state, itemSubject);
|
|
649
608
|
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
object: null,
|
|
654
|
-
types: [],
|
|
655
|
-
predicates: [],
|
|
656
|
-
datatype: null,
|
|
657
|
-
language: null,
|
|
658
|
-
entries: []
|
|
659
|
-
};
|
|
609
|
+
if (combinedSem.entries.length > 0) {
|
|
610
|
+
const prevSubject = state.currentSubject;
|
|
611
|
+
state.currentSubject = itemSubject;
|
|
660
612
|
|
|
661
|
-
|
|
662
|
-
if (listFrame?.contextSem) {
|
|
663
|
-
const inheritedSem = processContextSem({
|
|
664
|
-
sem: listFrame.contextSem,
|
|
665
|
-
itemSubject,
|
|
666
|
-
contextSubject: listFrame.contextSubject,
|
|
667
|
-
inheritLiterals: true,
|
|
668
|
-
state
|
|
669
|
-
});
|
|
613
|
+
processAnnotation({ type: 'list', text: token.text, range: token.range, attrsRange: token.attrsRange || null, valueRange: token.valueRange || null }, combinedSem, state, { preserveGlobalSubject: !state.listStack.length, implicitSubject: itemSubject });
|
|
670
614
|
|
|
671
|
-
|
|
672
|
-
combinedSem.types.push(...inheritedSem.types);
|
|
673
|
-
combinedSem.predicates.push(...inheritedSem.predicates);
|
|
674
|
-
combinedSem.entries.push(...inheritedSem.entries);
|
|
675
|
-
}
|
|
615
|
+
state.currentSubject = prevSubject;
|
|
676
616
|
}
|
|
617
|
+
};
|
|
677
618
|
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
619
|
+
const applyListAnchorAnnotations = (itemSubject, contextSem, state, listItemText, contextToken) => {
|
|
620
|
+
// Use the context token's ranges for proper origin tracking
|
|
621
|
+
const baseToken = contextToken || { range: [0, 0], attrsRange: [0, 0] };
|
|
622
|
+
|
|
623
|
+
const paragraphText = baseToken.text || '';
|
|
624
|
+
const annotationMatch = paragraphText.match(/\{[^}]+\}/);
|
|
625
|
+
|
|
626
|
+
let annotationStart;
|
|
627
|
+
if (annotationMatch && baseToken.range) {
|
|
628
|
+
// Found annotation in paragraph, calculate its absolute position
|
|
629
|
+
const relativeStart = paragraphText.indexOf(annotationMatch[0]);
|
|
630
|
+
annotationStart = baseToken.range[0] + relativeStart;
|
|
631
|
+
} else {
|
|
632
|
+
// Fallback to start of token
|
|
633
|
+
annotationStart = baseToken.range ? baseToken.range[0] : 0;
|
|
684
634
|
}
|
|
685
635
|
|
|
686
|
-
//
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
636
|
+
// Apply types with proper ranges
|
|
637
|
+
contextSem.types.forEach(type => {
|
|
638
|
+
const entry = contextSem.entries.find(e => e.kind === 'type' && e.iri === type.iri);
|
|
639
|
+
if (entry && entry.relRange) {
|
|
640
|
+
// Calculate absolute range: annotation start + relative range within annotation
|
|
641
|
+
const typeRange = [annotationStart + entry.relRange.start, annotationStart + entry.relRange.end];
|
|
642
|
+
|
|
643
|
+
emitQuad(state.quads, state.origin.quadIndex, 'list-anchor-type',
|
|
644
|
+
itemSubject,
|
|
645
|
+
state.df.namedNode(expandIRI('rdf:type', state.ctx)),
|
|
646
|
+
state.df.namedNode(expandIRI(type.iri, state.ctx)),
|
|
647
|
+
state.df,
|
|
648
|
+
{ type: 'list-anchor', range: typeRange, entryIndex: type.entryIndex }
|
|
649
|
+
);
|
|
693
650
|
}
|
|
694
651
|
});
|
|
695
652
|
|
|
696
|
-
//
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
653
|
+
// Apply predicates with proper ranges
|
|
654
|
+
contextSem.predicates.forEach(pred => {
|
|
655
|
+
if (pred.form !== '?' && pred.form !== '!') { // Skip context predicates
|
|
656
|
+
const entry = contextSem.entries.find(e => e.kind === 'property' && e.iri === pred.iri);
|
|
657
|
+
if (entry && entry.relRange) {
|
|
658
|
+
// Calculate absolute range: annotation start + relative range within annotation
|
|
659
|
+
const predRange = [annotationStart + entry.relRange.start, annotationStart + entry.relRange.end];
|
|
700
660
|
|
|
701
|
-
|
|
702
|
-
type: 'list',
|
|
703
|
-
text: token.text,
|
|
704
|
-
range: token.range,
|
|
705
|
-
attrsRange: token.attrsRange || null,
|
|
706
|
-
valueRange: token.valueRange || null
|
|
707
|
-
}, combinedSem, state, {
|
|
708
|
-
preserveGlobalSubject: !state.listStack.length,
|
|
709
|
-
implicitSubject: itemSubject
|
|
710
|
-
});
|
|
661
|
+
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
711
662
|
|
|
712
|
-
|
|
663
|
+
// For literal predicates, the value comes from the list item text
|
|
664
|
+
let objectValue;
|
|
665
|
+
if (pred.form === '') {
|
|
666
|
+
objectValue = state.df.literal(listItemText || '');
|
|
667
|
+
} else {
|
|
668
|
+
// For other forms, this would need more complex handling
|
|
669
|
+
objectValue = state.df.literal(listItemText || '');
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
emitQuad(state.quads, state.origin.quadIndex, 'list-anchor-predicate',
|
|
673
|
+
itemSubject, P, objectValue, state.df,
|
|
674
|
+
{ type: 'list-anchor', range: predRange, entryIndex: pred.entryIndex }
|
|
675
|
+
);
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
});
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
function processOrderedListItem(token, state) {
|
|
682
|
+
if (!state.isProcessingOrderedList) {
|
|
683
|
+
state.listCounter = (state.listCounter || 0) + 1;
|
|
684
|
+
state.rdfListIndex = 0;
|
|
685
|
+
state.firstListNode = null;
|
|
686
|
+
state.previousListNode = null;
|
|
687
|
+
state.contextConnected = false;
|
|
688
|
+
state.isProcessingOrderedList = true;
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
generateRdfListTriples(token, state);
|
|
692
|
+
|
|
693
|
+
const listFrame = state.listStack[state.listStack.length - 1];
|
|
694
|
+
if (listFrame?.contextSem) {
|
|
695
|
+
const carriers = getCarriers(token);
|
|
696
|
+
const itemInfo = findItemSubject(token, carriers, state);
|
|
697
|
+
if (itemInfo?.subject) {
|
|
698
|
+
applyListAnchorAnnotations(itemInfo.subject, listFrame.contextSem, state, token.text, listFrame.contextToken);
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
if (listFrame?.contextSem && listFrame?.contextSubject && !state.contextConnected) {
|
|
703
|
+
listFrame.contextSem.predicates.forEach(pred => {
|
|
704
|
+
if (pred.form === '?') {
|
|
705
|
+
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
706
|
+
const firstListNode = state.firstListNode;
|
|
707
|
+
if (firstListNode) {
|
|
708
|
+
emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-context',
|
|
709
|
+
listFrame.contextSubject, P, state.df.namedNode(firstListNode), state.df);
|
|
710
|
+
state.contextConnected = true;
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
});
|
|
713
714
|
}
|
|
714
715
|
}
|
|
715
716
|
|
|
716
|
-
function
|
|
717
|
-
const
|
|
717
|
+
/**
 * Emits the RDF collection quads (rdf:type rdf:List, rdf:first, rdf:rest)
 * for one ordered-list item and chains the new node onto the previous one.
 *
 * The list node IRI is `<base>#list-<listCounter>-<index>`, where `<base>` is
 * the value of the list frame's context subject, the current subject or the
 * document subject (whichever is set first), with any existing fragment
 * removed. When none is set, `state.ctx['']` (or an empty string) is used.
 *
 * Terms are created with `state.df`, the same factory the ordered-list
 * context quad is built with, so that all quads of one list come from a
 * single data factory. The module-level `DataFactory` is only a fallback.
 *
 * @param {object} token - The ordered-list item token.
 * @param {object} state - Mutable parser state; `rdfListIndex`,
 *   `firstListNode`, `previousListNode` and `quads` are updated.
 */
function generateRdfListTriples(token, state) {
    const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
    const RDF_LIST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#List';
    const RDF_FIRST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first';
    const RDF_REST = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest';
    const RDF_NIL = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil';

    const df = state.df ?? DataFactory;
    const carriers = getCarriers(token);

    const listIndex = (state.rdfListIndex || 0) + 1;
    state.rdfListIndex = listIndex;
    const listNodeName = `list-${state.listCounter}-${listIndex}`;

    const listFrame = state.listStack[state.listStack.length - 1];
    const contextSubject = listFrame?.contextSubject || state.currentSubject || state.documentSubject;
    const baseIRI = contextSubject ? contextSubject.value : (state.ctx[''] || '');

    // Replace an existing fragment rather than appending a second '#'.
    const listNodeIri = baseIRI.includes('#')
        ? `${baseIRI.split('#')[0]}#${listNodeName}`
        : `${baseIRI}#${listNodeName}`;

    // Remember the head of the list so the context subject can point at it.
    if (!state.firstListNode) state.firstListNode = listNodeIri;

    const valueRange = token.valueRange || token.range;

    // rdf:type rdf:List
    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-type',
        df.namedNode(listNodeIri),
        df.namedNode(RDF_TYPE),
        df.namedNode(RDF_LIST),
        df,
        { type: 'ordered-list', range: valueRange, listNodeName }
    );

    // The rdf:first object is the item's explicit value, else its subject,
    // else a plain literal of the item text.
    const itemInfo = findItemSubject(token, carriers, state);
    let firstObject;
    if (itemInfo?.value) {
        firstObject = itemInfo.value;
    } else if (itemInfo?.subject) {
        firstObject = itemInfo.subject;
    } else {
        firstObject = df.literal(token.text);
    }

    // Items with a subject point their origin at the annotation range;
    // all other items point at the value range.
    const originRange = itemInfo?.subject
        ? (token.attrsRange || token.valueRange || token.range)
        : valueRange;

    // rdf:first
    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-first',
        df.namedNode(listNodeIri),
        df.namedNode(RDF_FIRST),
        firstObject,
        df,
        { type: 'ordered-list', range: originRange, listNodeName }
    );

    if (state.previousListNode) {
        // The previous node currently ends the list with rdf:rest -> rdf:nil;
        // replace that quad with one pointing at the node created here.
        const prevRestQuadIndex = state.quads.findIndex(q =>
            q.subject.value === state.previousListNode &&
            q.predicate.value === RDF_REST
        );
        if (prevRestQuadIndex !== -1) {
            state.quads.splice(prevRestQuadIndex, 1);

            emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-rest-update',
                df.namedNode(state.previousListNode),
                df.namedNode(RDF_REST),
                df.namedNode(listNodeIri),
                df,
                { type: 'ordered-list', range: valueRange, listNodeName: state.previousListNode }
            );
        }
    }

    // rdf:rest -> rdf:nil; rewritten above if another item follows.
    emitQuad(state.quads, state.origin.quadIndex, 'ordered-list-rdf-rest',
        df.namedNode(listNodeIri),
        df.namedNode(RDF_REST),
        df.namedNode(RDF_NIL),
        df,
        { type: 'ordered-list', range: valueRange, listNodeName }
    );

    state.previousListNode = listNodeIri;
}
|
|
803
|
+
|
|
804
|
+
/**
 * Detects a paragraph of the form `text {annotations}` and records it in
 * `state.pendingListContext` as the context for a list.
 *
 * The context subject is chosen as follows:
 *   1. `state.currentSubject`, else `state.documentSubject`;
 *   2. if neither is set, the subject of the nearest preceding heading whose
 *      attributes resolve to a subject;
 *   3. when the next token is a list item indented deeper than the current
 *      list frame, that frame's `anchorSubject` overrides the above.
 *
 * @param {object} token - The paragraph token.
 * @param {object} state - Mutable parser state; `pendingListContext` is set
 *   when the paragraph matches.
 */
function processListContextFromParagraph(token, state) {
    const contextMatch = LIST_CONTEXT_REGEX.exec(token.text);
    if (!contextMatch) return;

    const contextSem = parseSemCached(`{${contextMatch[2]}}`);
    let contextSubject = state.currentSubject || state.documentSubject;

    if (!contextSubject && state.tokens) {
        for (let i = state.currentTokenIndex - 1; i >= 0; i--) {
            const prevToken = state.tokens[i];
            if (prevToken.type !== 'heading' || !prevToken.attrs) continue;

            const prevSem = parseSemCached(prevToken.attrs);
            if (!prevSem.subject) continue;

            const resolvedSubject = resolveSubject(prevSem, state);
            if (resolvedSubject) {
                // Keep the resolved term itself, not its `.value` string, so
                // that contextSubject has the same shape as
                // state.currentSubject / state.documentSubject: consumers
                // read `.value` from it and use it directly as a quad subject.
                contextSubject = resolvedSubject;
                break;
            }
        }
    }

    const nextToken = state.tokens?.[state.currentTokenIndex + 1];
    const nextIsList = nextToken &&
        (nextToken.type === 'unordered-list' || nextToken.type === 'ordered-list');
    if (state.listStack.length > 0 && nextIsList) {
        const currentFrame = state.listStack[state.listStack.length - 1];
        // A deeper-indented list that follows belongs to the current item.
        if (currentFrame.anchorSubject && nextToken.indent > currentFrame.indent) {
            contextSubject = currentFrame.anchorSubject;
        }
    }

    state.pendingListContext = {
        sem: contextSem,
        subject: contextSubject,
        // Only the first ':' is removed.
        contextText: contextMatch[1].replace(':', '').trim(),
        contextToken: token // Store the context token for origin ranges
    };
}
|
|
744
842
|
|
|
745
|
-
// Helper functions for token processing
|
|
746
843
|
function processTokenAnnotations(token, state, tokenType) {
|
|
747
|
-
// Process token's own attributes
|
|
748
844
|
if (token.attrs) {
|
|
749
845
|
const sem = parseSemCached(token.attrs);
|
|
750
846
|
processAnnotation({
|
|
751
|
-
type: tokenType,
|
|
752
|
-
|
|
753
|
-
range: token.range,
|
|
754
|
-
attrsRange: token.attrsRange || null,
|
|
755
|
-
valueRange: token.valueRange || null
|
|
847
|
+
type: tokenType, text: token.text, range: token.range,
|
|
848
|
+
attrsRange: token.attrsRange || null, valueRange: token.valueRange || null
|
|
756
849
|
}, sem, state);
|
|
757
850
|
}
|
|
758
851
|
|
|
759
|
-
|
|
760
|
-
const carriers = getCarriers(token);
|
|
761
|
-
carriers.forEach(carrier => {
|
|
852
|
+
getCarriers(token).forEach(carrier => {
|
|
762
853
|
if (carrier.attrs) {
|
|
763
854
|
const sem = parseSemCached(carrier.attrs);
|
|
764
855
|
processAnnotation(carrier, sem, state);
|
|
@@ -772,17 +863,52 @@ function processStandaloneSubject(token, state) {
|
|
|
772
863
|
|
|
773
864
|
const sem = parseSemCached(`{=${match[1]}}`);
|
|
774
865
|
const attrsStart = token.range[0] + token.text.indexOf('{=');
|
|
775
|
-
const attrsEnd = attrsStart + (match[1] ? match[1].length : 0);
|
|
776
|
-
|
|
777
866
|
processAnnotation({
|
|
778
|
-
type: 'standalone',
|
|
779
|
-
|
|
780
|
-
range: token.range,
|
|
781
|
-
attrsRange: [attrsStart, attrsEnd],
|
|
867
|
+
type: 'standalone', text: '', range: token.range,
|
|
868
|
+
attrsRange: [attrsStart, attrsStart + (match[1] ? match[1].length : 0)],
|
|
782
869
|
valueRange: null
|
|
783
870
|
}, sem, state);
|
|
784
871
|
}
|
|
785
872
|
|
|
873
|
+
/**
 * Dispatch table mapping a token type to its handler `(token, state)`.
 *
 * Every handler except `ordered-list` clears `state.isProcessingOrderedList`
 * (for `para` only when the text contains neither `{?` nor `{!`), so that the
 * next ordered-list item starts a fresh list.
 */
const TOKEN_PROCESSORS = {
    heading(token, state) {
        state.isProcessingOrderedList = false;
        // A heading that declares a resolvable subject becomes the document subject.
        const headingSem = token.attrs ? parseSemCached(token.attrs) : null;
        if (headingSem && headingSem.subject) {
            const subject = resolveSubject(headingSem, state);
            if (subject) {
                state.documentSubject = subject;
            }
        }
        processTokenAnnotations(token, state, token.type);
    },

    code(token, state) {
        state.isProcessingOrderedList = false;
        processTokenAnnotations(token, state, token.type);
    },

    blockquote(token, state) {
        state.isProcessingOrderedList = false;
        processTokenAnnotations(token, state, token.type);
    },

    para(token, state) {
        // Paragraphs containing `{?` or `{!` do not end an ordered list.
        const hasObjectAnnotation = token.text.includes('{?') || token.text.includes('{!');
        if (!hasObjectAnnotation) {
            state.isProcessingOrderedList = false;
        }
        processStandaloneSubject(token, state);
        processListContextFromParagraph(token, state);
        processTokenAnnotations(token, state, token.type);
    },

    'unordered-list'(token, state) {
        state.isProcessingOrderedList = false;
        manageListStack(token, state);
        processListItem(token, state);
    },

    'ordered-list'(token, state) {
        manageListStack(token, state);
        processOrderedListItem(token, state);
    }
};
|
|
911
|
+
|
|
786
912
|
export function parse(text, options = {}) {
|
|
787
913
|
const state = {
|
|
788
914
|
ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
|
|
@@ -790,65 +916,38 @@ export function parse(text, options = {}) {
|
|
|
790
916
|
quads: [],
|
|
791
917
|
origin: { blocks: new Map(), quadIndex: new Map() },
|
|
792
918
|
currentSubject: null,
|
|
793
|
-
documentSubject: null,
|
|
919
|
+
documentSubject: null,
|
|
794
920
|
listStack: [],
|
|
795
921
|
pendingListContext: null,
|
|
796
|
-
tokens: null,
|
|
797
|
-
currentTokenIndex: -1
|
|
922
|
+
tokens: null,
|
|
923
|
+
currentTokenIndex: -1,
|
|
924
|
+
listCounter: 0,
|
|
925
|
+
rdfListIndex: 0,
|
|
926
|
+
firstListNode: null,
|
|
927
|
+
previousListNode: null,
|
|
928
|
+
contextConnected: false,
|
|
929
|
+
isProcessingOrderedList: false
|
|
798
930
|
};
|
|
799
931
|
|
|
800
932
|
state.tokens = scanTokens(text);
|
|
801
933
|
|
|
802
|
-
// Process prefix declarations first with prefix folding support
|
|
803
934
|
state.tokens.filter(t => t.type === 'prefix').forEach(t => {
|
|
804
|
-
// Check if the IRI value contains a CURIE that references a previously defined prefix
|
|
805
935
|
let resolvedIri = t.iri;
|
|
806
936
|
if (t.iri.includes(':')) {
|
|
807
|
-
const
|
|
808
|
-
const
|
|
937
|
+
const colonIndex = t.iri.indexOf(':');
|
|
938
|
+
const potentialPrefix = t.iri.substring(0, colonIndex);
|
|
939
|
+
const reference = t.iri.substring(colonIndex + 1);
|
|
809
940
|
if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
|
|
810
|
-
// This is a CURIE referencing an existing prefix - resolve it
|
|
811
941
|
resolvedIri = state.ctx[potentialPrefix] + reference;
|
|
812
942
|
}
|
|
813
943
|
}
|
|
814
944
|
state.ctx[t.prefix] = resolvedIri;
|
|
815
945
|
});
|
|
816
946
|
|
|
817
|
-
// Process all other tokens
|
|
818
947
|
for (let i = 0; i < state.tokens.length; i++) {
|
|
819
948
|
const token = state.tokens[i];
|
|
820
949
|
state.currentTokenIndex = i;
|
|
821
|
-
|
|
822
|
-
switch (token.type) {
|
|
823
|
-
case 'heading':
|
|
824
|
-
// Update document subject when processing headings
|
|
825
|
-
if (token.attrs) {
|
|
826
|
-
const headingSem = parseSemCached(token.attrs);
|
|
827
|
-
if (headingSem.subject) {
|
|
828
|
-
const subject = resolveSubject(headingSem, state);
|
|
829
|
-
if (subject) {
|
|
830
|
-
state.documentSubject = subject;
|
|
831
|
-
}
|
|
832
|
-
}
|
|
833
|
-
}
|
|
834
|
-
processTokenAnnotations(token, state, token.type);
|
|
835
|
-
break;
|
|
836
|
-
case 'code':
|
|
837
|
-
case 'blockquote':
|
|
838
|
-
processTokenAnnotations(token, state, token.type);
|
|
839
|
-
break;
|
|
840
|
-
|
|
841
|
-
case 'para':
|
|
842
|
-
processStandaloneSubject(token, state);
|
|
843
|
-
processListContextFromParagraph(token, state);
|
|
844
|
-
processTokenAnnotations(token, state, token.type);
|
|
845
|
-
break;
|
|
846
|
-
|
|
847
|
-
case 'list':
|
|
848
|
-
manageListStack(token, state);
|
|
849
|
-
processListItem(token, state);
|
|
850
|
-
break;
|
|
851
|
-
}
|
|
950
|
+
TOKEN_PROCESSORS[token.type]?.(token, state);
|
|
852
951
|
}
|
|
853
952
|
|
|
854
953
|
return { quads: state.quads, origin: state.origin, context: state.ctx };
|