mdld-parse 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +8 -5
- package/src/index.js +10 -0
- package/src/parse.js +788 -0
- package/src/serialize.js +531 -0
- package/src/utils.js +305 -0
- package/index.js +0 -1364
package/src/parse.js
ADDED
|
@@ -0,0 +1,788 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_CONTEXT,
|
|
3
|
+
DataFactory,
|
|
4
|
+
expandIRI,
|
|
5
|
+
parseSemanticBlock,
|
|
6
|
+
quadIndexKey,
|
|
7
|
+
createSlotInfo,
|
|
8
|
+
createLiteral,
|
|
9
|
+
hash
|
|
10
|
+
} from './utils.js';
|
|
11
|
+
|
|
12
|
+
// Semantic block cache to avoid repeated parsing.
// A Map is used rather than a plain object so that a key such as
// "constructor" or "__proto__" can never resolve to an inherited
// Object.prototype member and be mistaken for a cached result.
const semCache = new Map();
const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });

/**
 * Parse an attribute string into its semantic description, memoised per string.
 *
 * @param {string|null|undefined} attrs - Raw attribute text; falsy means "none".
 * @returns {object} The shared frozen EMPTY_SEM for falsy input, otherwise the
 *   frozen result of parseSemanticBlock(attrs). Repeated calls with the same
 *   string return the same object.
 */
function parseSemCached(attrs) {
    if (!attrs) return EMPTY_SEM;
    let sem = semCache.get(attrs);
    if (!sem) {
        sem = Object.freeze(parseSemanticBlock(attrs));
        semCache.set(attrs, sem);
    }
    return sem;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Locate the last occurrence of an attribute block inside a line.
 *
 * @param {string} line - The full source line.
 * @param {string|null} attrs - Attribute text to look for.
 * @param {number} lineStart - Offset of the line within the document.
 * @returns {number[]|null} [start, end) document offsets, or null when attrs
 *   is falsy or does not occur in the line.
 */
function calcAttrsRange(line, attrs, lineStart) {
    if (!attrs) {
        return null;
    }
    const offsetInLine = line.lastIndexOf(attrs);
    if (offsetInLine < 0) {
        return null;
    }
    const begin = lineStart + offsetInLine;
    return [begin, begin + attrs.length];
}
|
|
31
|
+
|
|
32
|
+
/**
 * Translate a line-relative value span into document offsets.
 *
 * @param {number} lineStart - Offset of the line within the document.
 * @param {number} valueStartInLine - Start of the value inside the line.
 * @param {number} valueEndInLine - End of the value inside the line.
 * @returns {number[]} [start, end] in document offsets.
 */
function calcValueRange(lineStart, valueStartInLine, valueEndInLine) {
    const from = lineStart + valueStartInLine;
    const to = lineStart + valueEndInLine;
    return [from, to];
}
|
|
35
|
+
|
|
36
|
+
/**
 * Build a block-level token.
 *
 * The returned object also carries a hidden, writable `_carriers` slot
 * (initially null) that getCarriers() fills on first use; it is
 * non-enumerable so it does not show up when the token is spread or listed.
 *
 * @param {string} type - Token kind.
 * @param {number[]} range - Document range of the token.
 * @param {string} text - Token text.
 * @param {string|null} [attrs] - Raw attribute block, if any.
 * @param {number[]|null} [attrsRange] - Range of the attribute block.
 * @param {number[]|null} [valueRange] - Range of the value text.
 * @param {object} [extra] - Additional fields merged onto the token.
 * @returns {object} The token.
 */
function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
    const base = { type, range, text, attrs, attrsRange, valueRange };
    const token = { ...base, ...extra };
    const lazySlot = { enumerable: false, writable: true, value: null };
    return Object.defineProperty(token, '_carriers', lazySlot);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Return the inline carriers of a token, extracting them on first access and
 * remembering the result on the token's `_carriers` slot.
 *
 * NOTE(review): the base offset passed to the extractor is token.range[0],
 * while the text scanned is token.text — confirm these share an origin.
 *
 * @param {object} token - A token with `text`, `range` and `_carriers`.
 * @returns {object[]} The token's inline carriers.
 */
function getCarriers(token) {
    if (token._carriers) {
        return token._carriers;
    }
    const extracted = extractInlineCarriers(token.text, token.range[0]);
    token._carriers = extracted;
    return extracted;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Split the document into block-level tokens, one line at a time.
 *
 * Recognised forms: fenced code blocks (type 'code'), prefix declarations
 * `[prefix] <iri>` (type 'prefix'), headings, list items, blockquotes and,
 * for any other non-blank line, paragraphs (type 'para').
 *
 * Inside an open fence every line is content until a line starting with the
 * opening fence string is seen. A line that starts with a shorter backtick
 * run (e.g. ``` inside a ```` block) is kept as content instead of dropped.
 * A fence that is never closed produces no token.
 *
 * @param {string} text - Full document text; lines are split on '\n'.
 * @returns {object[]} Tokens in document order.
 */
function scanTokens(text) {
    const tokens = [];
    const lines = text.split('\n');
    let pos = 0;
    let codeBlock = null;

    for (const line of lines) {
        const lineStart = pos;
        pos += line.length + 1; // +1 for the '\n' consumed by split

        if (line.startsWith('```')) {
            if (!codeBlock) {
                // Opening fence: remember fence string, language and attrs.
                const fence = line.match(/^(`{3,})(.*)/);
                const attrsText = fence[2].match(/\{[^{}]*\}/)?.[0] || null;
                const attrsStartInLine = attrsText ? line.indexOf(attrsText) : -1;
                const contentStart = lineStart + line.length + 1;
                codeBlock = {
                    fence: fence[1],
                    start: lineStart,
                    content: [],
                    lang: fence[2].trim().split(/[\s{]/)[0],
                    attrs: attrsText,
                    attrsRange: attrsText && attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrsText.length] : null,
                    valueRangeStart: contentStart
                };
                continue;
            }
            if (line.startsWith(codeBlock.fence)) {
                // Closing fence: emit the accumulated block.
                const valueStart = codeBlock.valueRangeStart;
                const valueEnd = Math.max(valueStart, lineStart - 1);
                tokens.push({
                    type: 'code',
                    range: [codeBlock.start, lineStart],
                    text: codeBlock.content.join('\n'),
                    lang: codeBlock.lang,
                    attrs: codeBlock.attrs,
                    attrsRange: codeBlock.attrsRange,
                    valueRange: [valueStart, valueEnd]
                });
                codeBlock = null;
                continue;
            }
            // Shorter backtick run inside a longer fence: fall through so the
            // line is recorded as code content below.
        }

        if (codeBlock) {
            codeBlock.content.push(line);
            continue;
        }

        const prefixMatch = line.match(/^\[([^\]]+)\]\s*<([^>]+)>/);
        if (prefixMatch) {
            tokens.push({ type: 'prefix', prefix: prefixMatch[1], iri: prefixMatch[2].trim() });
            continue;
        }

        const headingMatch = line.match(/^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/);
        if (headingMatch) {
            const attrs = headingMatch[3] || null;
            const afterHashes = headingMatch[1].length;
            const ws = line.substring(afterHashes).match(/^\s+/)?.[0]?.length || 0;
            const valueStartInLine = afterHashes + ws;
            const valueEndInLine = valueStartInLine + headingMatch[2].length;
            tokens.push(createToken('heading', [lineStart, pos - 1], headingMatch[2].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine),
                { depth: headingMatch[1].length }));
            continue;
        }

        const listMatch = line.match(/^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/);
        if (listMatch) {
            const attrs = listMatch[4] || null;
            const prefix = listMatch[1].length + listMatch[2].length;
            const ws = line.substring(prefix).match(/^\s+/)?.[0]?.length || 0;
            const valueStartInLine = prefix + ws;
            const valueEndInLine = valueStartInLine + listMatch[3].length;
            tokens.push(createToken('list', [lineStart, pos - 1], listMatch[3].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine),
                { indent: listMatch[1].length }));
            continue;
        }

        const blockquoteMatch = line.match(/^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/);
        if (blockquoteMatch) {
            const attrs = blockquoteMatch[2] || null;
            const quotePrefix = line.match(/^>\s+/);
            const valueStartInLine = quotePrefix ? quotePrefix[0].length : 2;
            const valueEndInLine = valueStartInLine + blockquoteMatch[1].length;
            tokens.push(createToken('blockquote', [lineStart, pos - 1], blockquoteMatch[1].trim(), attrs,
                calcAttrsRange(line, attrs, lineStart),
                calcValueRange(lineStart, valueStartInLine, valueEndInLine)));
            continue;
        }

        if (line.trim()) {
            tokens.push(createToken('para', [lineStart, pos - 1], line.trim()));
        }
    }

    return tokens;
}
|
|
155
|
+
|
|
156
|
+
// Inline carrier patterns. Both are sticky (`y`), so a match only succeeds
// exactly at `lastIndex`; callers must set `lastIndex` before every exec().
// Group 1 is the carried value, group 2 the attribute text between the braces.
const INLINE_CARRIER_PATTERNS = {
    // A run of `*`, `_` or backtick delimiters around the value, then `{...}`.
    EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
    // A double-backtick code span, then `{...}`.
    CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
};
|
|
161
|
+
|
|
162
|
+
/**
 * Assemble an inline carrier record.
 *
 * @param {string} type - Carrier kind.
 * @param {string} text - Carried text.
 * @param {string|null} attrs - Attribute block, if any.
 * @param {number[]|null} attrsRange - Range of the attribute block.
 * @param {number[]|null} valueRange - Range of the carried text.
 * @param {number[]} range - Range of the whole carrier.
 * @param {number} pos - Scan position at which extraction should resume.
 * @param {object} [extra] - Extra fields merged last (may override the above).
 * @returns {object} The carrier.
 */
function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
    const core = { type, text, attrs, attrsRange, valueRange, range, pos };
    return { ...core, ...extra };
}
|
|
165
|
+
|
|
166
|
+
/**
 * Walk the text left to right collecting inline carriers.
 *
 * At each position the emphasis, code-span and bracket extractors are tried
 * in that order. A hit moves the cursor to the hit's `pos`; a hit flagged
 * `skip` is consumed without being recorded. With no hit the cursor advances
 * by one character.
 *
 * @param {string} text - Text to scan.
 * @param {number} [baseOffset=0] - Offset handed on to the extractors.
 * @returns {object[]} Carriers in order of appearance.
 */
function extractInlineCarriers(text, baseOffset = 0) {
    const found = [];
    const tryAt = (at) =>
        tryExtractEmphasisCarrier(text, at, baseOffset) ||
        tryExtractCodeCarrier(text, at, baseOffset) ||
        tryExtractBracketCarrier(text, at, baseOffset);

    let cursor = 0;
    while (cursor < text.length) {
        const hit = tryAt(cursor);
        if (!hit) {
            cursor += 1; // nothing starts here; move on one character
            continue;
        }
        if (!hit.skip) {
            found.push(hit);
        }
        cursor = hit.pos;
    }

    return found;
}
|
|
201
|
+
|
|
202
|
+
/**
 * Derive the ranges of an emphasis/code carrier from its regex match.
 *
 * All returned ranges are document offsets (they include `baseOffset`);
 * only `pos` is relative to the scanned text, because the caller uses it as
 * the next scan position.
 *
 * The attribute block is located from the END of the match (the patterns
 * always end in `{...}`), so a `{` inside the carried value cannot be
 * mistaken for the attribute brace.
 *
 * NOTE(review): valueRange starts at the match start, i.e. at the opening
 * delimiter rather than at the first value character — kept as is; confirm
 * whether consumers expect the delimiter to be excluded.
 *
 * @param {RegExpExecArray} match - Match with group 1 = value, group 2 = attrs body.
 * @param {number} baseOffset - Document offset of the scanned text.
 * @param {number} matchStart - Index of the match within the scanned text.
 * @returns {{valueRange: number[], attrsRange: number[], range: number[], pos: number}}
 *   attrsRange excludes the braces; range spans match start to closing brace.
 */
function calcCarrierRanges(match, baseOffset, matchStart) {
    const valueStart = baseOffset + matchStart;
    const valueEnd = valueStart + match[1].length;
    const attrsLength = match[2].length + 2; // +2 for { and }
    // Text-relative positions of the attribute block.
    const braceInText = matchStart + match[0].length - attrsLength;
    const afterBraceInText = braceInText + attrsLength;
    return {
        valueRange: [valueStart, valueEnd],
        attrsRange: [baseOffset + braceInText + 1, baseOffset + afterBraceInText - 1], // Exclude braces
        range: [valueStart, baseOffset + afterBraceInText],
        pos: afterBraceInText
    };
}
|
|
214
|
+
|
|
215
|
+
/**
 * Try to read an emphasis carrier starting exactly at `pos`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Position at which the carrier must start.
 * @param {number} baseOffset - Offset passed through to the range calculation.
 * @returns {object|null} An 'emphasis' carrier, or null when none starts here.
 */
function tryExtractEmphasisCarrier(text, pos, baseOffset) {
    const pattern = INLINE_CARRIER_PATTERNS.EMPHASIS;
    pattern.lastIndex = pos; // sticky regex: anchor the attempt at pos
    const match = pattern.exec(text);
    if (!match) {
        return null;
    }

    const { attrsRange, valueRange, range, pos: resumeAt } = calcCarrierRanges(match, baseOffset, match.index);
    const [, value, attrsBody] = match;
    return createCarrier('emphasis', value, `{${attrsBody}}`, attrsRange, valueRange, range, resumeAt);
}
|
|
224
|
+
|
|
225
|
+
/**
 * Try to read a double-backtick code-span carrier starting exactly at `pos`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Position at which the carrier must start.
 * @param {number} baseOffset - Offset passed through to the range calculation.
 * @returns {object|null} A 'code' carrier, or null when none starts here.
 */
function tryExtractCodeCarrier(text, pos, baseOffset) {
    const pattern = INLINE_CARRIER_PATTERNS.CODE_SPAN;
    pattern.lastIndex = pos; // sticky regex: anchor the attempt at pos
    const match = pattern.exec(text);
    if (!match) {
        return null;
    }

    const { attrsRange, valueRange, range, pos: resumeAt } = calcCarrierRanges(match, baseOffset, match.index);
    const [, value, attrsBody] = match;
    return createCarrier('code', value, `{${attrsBody}}`, attrsRange, valueRange, range, resumeAt);
}
|
|
234
|
+
|
|
235
|
+
/**
 * Try to read a bracket carrier — `[text]`, optionally followed by `(url)`
 * and/or `{attrs}` — starting exactly at `pos`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} pos - Position at which the `[` must sit.
 * @param {number} baseOffset - Offset added to the returned ranges.
 * @returns {object|null} null when no balanced bracket starts at `pos`;
 *   `{ skip: true, pos }` when the parenthesised target starts with '=';
 *   otherwise a 'link' (with url) or 'span' carrier.
 */
function tryExtractBracketCarrier(text, pos, baseOffset) {
    // Direct character test: searching forward for the next '[' only to
    // compare it with pos would rescan the rest of the text on every call.
    if (text[pos] !== '[') return null;
    const bracketStart = pos;

    const bracketEnd = findMatchingBracket(text, bracketStart);
    if (!bracketEnd) return null;

    // bracketEnd is the index just past the closing ']'.
    const carrierText = text.substring(bracketStart + 1, bracketEnd - 1);
    const { url, spanEnd } = extractUrlFromBrackets(text, bracketEnd);
    const { attrs, attrsRange, finalSpanEnd } = extractAttributesFromText(text, spanEnd, baseOffset);
    const { carrierType, resourceIRI } = determineCarrierType(url);

    if (url && url.startsWith('=')) {
        return { skip: true, pos: finalSpanEnd };
    }

    return createCarrier(carrierType, carrierText, attrs, attrsRange,
        [baseOffset + bracketStart + 1, baseOffset + bracketEnd - 1],
        [baseOffset + bracketStart, baseOffset + finalSpanEnd],
        finalSpanEnd, { url: resourceIRI });
}
|
|
256
|
+
|
|
257
|
+
/**
 * Find the `]` that balances the `[` at `bracketStart`, honouring nesting.
 *
 * @param {string} text - Text to search.
 * @param {number} bracketStart - Index of the opening '['.
 * @returns {number|null} Index just past the balancing ']', or null when the
 *   bracket is never closed.
 */
function findMatchingBracket(text, bracketStart) {
    let depth = 1;
    for (let i = bracketStart + 1; i < text.length; i += 1) {
        const ch = text[i];
        if (ch === '[') {
            depth += 1;
        } else if (ch === ']') {
            depth -= 1;
            if (depth === 0) {
                return i + 1;
            }
        }
    }
    return null;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Read an optional `(target)` immediately following a closing bracket.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketEnd - Index just past the closing ']'.
 * @returns {{url: string|null, spanEnd: number}} The target and the index just
 *   past ')'; `{ url: null, spanEnd: bracketEnd }` when there is no '(' at
 *   bracketEnd or no ')' after it.
 */
function extractUrlFromBrackets(text, bracketEnd) {
    const untouched = { url: null, spanEnd: bracketEnd };
    if (text[bracketEnd] !== '(') {
        return untouched;
    }
    const closeAt = text.indexOf(')', bracketEnd);
    if (closeAt === -1) {
        return untouched;
    }
    return {
        url: text.substring(bracketEnd + 1, closeAt),
        spanEnd: closeAt + 1
    };
}
|
|
287
|
+
|
|
288
|
+
/**
 * Read an optional `{attrs}` block (leading whitespace allowed) at `spanEnd`.
 *
 * @param {string} text - Text being scanned.
 * @param {number} spanEnd - Index at which the block may begin.
 * @param {number} baseOffset - Offset added to the returned attrsRange.
 * @returns {{attrs: string|null, attrsRange: number[]|null, finalSpanEnd: number}}
 *   attrs includes its braces and attrsRange covers them; finalSpanEnd is the
 *   index just past the block, or spanEnd unchanged when there is none.
 */
function extractAttributesFromText(text, spanEnd, baseOffset) {
    const tail = text.substring(spanEnd);
    const found = /^\s*\{([^}]+)\}/.exec(tail);
    if (found === null) {
        return { attrs: null, attrsRange: null, finalSpanEnd: spanEnd };
    }

    const [whole, body] = found;
    const attrs = `{${body}}`;
    const braceIndex = whole.indexOf('{');
    const absStart = baseOffset + spanEnd + (braceIndex >= 0 ? braceIndex : 0);
    return {
        attrs,
        attrsRange: [absStart, absStart + attrs.length],
        finalSpanEnd: spanEnd + whole.length
    };
}
|
|
303
|
+
|
|
304
|
+
/**
 * Classify a bracket carrier by its parenthesised target.
 *
 * @param {string|null} url - Target text, or null/empty when absent.
 * @returns {{carrierType: string, resourceIRI: string|null}} 'link' with the
 *   target as resourceIRI when it is non-empty and does not start with '=';
 *   otherwise 'span' with a null resourceIRI.
 */
function determineCarrierType(url) {
    const isLink = Boolean(url) && !url.startsWith('=');
    return isLink
        ? { carrierType: 'link', resourceIRI: url }
        : { carrierType: 'span', resourceIRI: null };
}
|
|
315
|
+
|
|
316
|
+
/**
 * Build the origin record ("block") for one annotated carrier.
 *
 * The id is a hash of subject, carrier type, expanded types and expanded
 * predicates (with their form prefix), so blocks with the same semantic
 * content share an id regardless of where they sit in the text.
 *
 * @param {string} subject - Subject IRI.
 * @param {Array<string|{iri: string}>} types - Type tokens to expand.
 * @param {Array<{iri: string, form: string}>} predicates - Predicate tokens.
 * @param {Array} entries - Stored as is; defaults to [] when falsy.
 * @param {number[]} range - Range of the carrier.
 * @param {number[]|null} attrsRange - Range of the attribute block.
 * @param {number[]|null} valueRange - Range of the value.
 * @param {string|null} carrierType - Carrier kind ('unknown' in the signature when falsy).
 * @param {object} ctx - Prefix context; a shallow copy is stored on the block.
 * @returns {object} The block record.
 */
function createBlock(subject, types, predicates, entries, range, attrsRange, valueRange, carrierType, ctx) {
    const asSpan = (pair) => (pair ? { start: pair[0], end: pair[1] } : null);

    const expandedTypes = types.map((t) => expandIRI(typeof t === 'string' ? t : t.iri, ctx));
    const expandedPredicates = predicates.map((p) => ({ iri: expandIRI(p.iri, ctx), form: p.form }));

    // Use semantic signature for stable block identity
    const signatureParts = [
        subject,
        carrierType || 'unknown',
        expandedTypes.join(','),
        expandedPredicates.map((p) => `${p.form}${p.iri}`).join(',')
    ];
    const id = hash(signatureParts.join('|'));

    return {
        id,
        range: { start: range[0], end: range[1] },
        attrsRange: asSpan(attrsRange),
        valueRange: asSpan(valueRange),
        carrierType: carrierType || null,
        subject,
        types: expandedTypes,
        predicates: expandedPredicates,
        entries: entries || [],
        context: { ...ctx }
    };
}
|
|
345
|
+
|
|
346
|
+
/**
 * Append a quad to the output and record where it came from.
 *
 * Nothing happens when subject, predicate or object is missing.
 *
 * @param {Array} quads - Output array, appended to.
 * @param {Map} quadIndex - Quad key to slot info, written to.
 * @param {string} blockId - Id of the originating block.
 * @param {object} subject - Subject term.
 * @param {object} predicate - Predicate term.
 * @param {object} object - Object term.
 * @param {object} dataFactory - Factory providing quad().
 * @param {object|null} [meta] - Extra slot data; meta.entryIndex is forwarded.
 * @returns {void}
 */
function emitQuad(quads, quadIndex, blockId, subject, predicate, object, dataFactory, meta = null) {
    const complete = subject && predicate && object;
    if (!complete) {
        return;
    }

    const quad = dataFactory.quad(subject, predicate, object);
    quads.push(quad);

    // Slot info carries the caller's meta plus the terms themselves.
    const slotDetails = { ...meta, subject, predicate, object };
    const slotInfo = createSlotInfo(blockId, meta?.entryIndex, slotDetails);

    const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
    quadIndex.set(key, slotInfo);
}
|
|
361
|
+
/**
 * Turn a semantic block's subject declaration into a named node.
 *
 * @param {object} sem - Semantic block; only `subject` is read.
 * @param {object} state - Parser state (currentSubject, df, ctx).
 * @returns {object|null} null when there is no subject; null after clearing
 *   state.currentSubject for 'RESET'; for '=#frag', the current subject's
 *   base (text before '#') plus '#frag', or null without a current subject;
 *   otherwise the expanded subject IRI as a named node.
 */
function resolveSubject(sem, state) {
    const { subject } = sem;
    if (!subject) {
        return null;
    }
    if (subject === 'RESET') {
        state.currentSubject = null;
        return null;
    }
    if (!subject.startsWith('=#')) {
        return state.df.namedNode(expandIRI(subject, state.ctx));
    }
    // Fragment form: only meaningful relative to a current subject.
    if (!state.currentSubject) {
        return null;
    }
    const fragment = subject.substring(2);
    const [baseIRI] = state.currentSubject.value.split('#');
    return state.df.namedNode(`${baseIRI}#${fragment}`);
}
|
|
378
|
+
|
|
379
|
+
/**
 * Turn a semantic block's object declaration into a named node.
 *
 * @param {object} sem - Semantic block; only `object` is read.
 * @param {object} state - Parser state (currentSubject, df, ctx).
 * @returns {object|null} null when there is no object; for '#frag', the
 *   current subject's base (text before '#') plus '#frag', or null without a
 *   current subject; otherwise the expanded object IRI as a named node.
 */
function resolveObject(sem, state) {
    const target = sem.object;
    if (!target) {
        return null;
    }
    if (!target.startsWith('#')) {
        // Regular soft IRI
        return state.df.namedNode(expandIRI(target, state.ctx));
    }
    if (!state.currentSubject) {
        return null;
    }
    const fragment = target.substring(1);
    const [baseIRI] = state.currentSubject.value.split('#');
    return state.df.namedNode(`${baseIRI}#${fragment}`);
}
|
|
393
|
+
|
|
394
|
+
/**
 * Emit one rdf:type quad per type declared in the semantic block.
 *
 * The typed node is the first available of: newSubject, localObject,
 * carrierO, S.
 *
 * @param {object} sem - Semantic block; `types` holds strings or {iri, entryIndex}.
 * @param {object|null} newSubject - Subject declared by this annotation.
 * @param {object|null} localObject - Object declared by this annotation.
 * @param {object|null} carrierO - Node for the carrier's URL.
 * @param {object} S - Effective subject.
 * @param {object} block - Origin block; its id is recorded with each quad.
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state) {
    const typeSubject = newSubject || localObject || carrierO || S;

    for (const t of sem.types) {
        const isPlain = typeof t === 'string';
        const typeIRI = isPlain ? t : t.iri;
        const entryIndex = isPlain ? null : t.entryIndex;
        const expandedType = expandIRI(typeIRI, state.ctx);

        emitQuad(
            state.quads, state.origin.quadIndex, block.id,
            typeSubject,
            state.df.namedNode(expandIRI('rdf:type', state.ctx)),
            state.df.namedNode(expandedType),
            state.df,
            { kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
        );
    }
}
|
|
411
|
+
|
|
412
|
+
/**
 * Emit one quad per predicate declared in the semantic block.
 *
 * The predicate's form decides the roles:
 *   ''  : (localObject or S) -> literal L
 *   '?' : anchor -> target
 *   '!' : target -> anchor
 * where anchor is previousSubject if this annotation declared a new subject
 * (else S), and target is localObject or newSubjectOrCarrierO. Predicates
 * with any other form, or with a missing subject/object, emit nothing.
 *
 * @param {object} sem - Semantic block; `predicates` holds {iri, form, entryIndex}.
 * @param {object|null} newSubject - Subject declared by this annotation.
 * @param {object|null} previousSubject - Current subject before this annotation.
 * @param {object|null} localObject - Object declared by this annotation.
 * @param {object|null} newSubjectOrCarrierO - newSubject or the carrier URL node.
 * @param {object} S - Effective subject.
 * @param {object} L - Literal built from the carrier text.
 * @param {object} block - Origin block; its id is recorded with each quad.
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state) {
    const anchor = newSubject ? previousSubject : S;
    const target = localObject || newSubjectOrCarrierO;

    const rolesFor = (form) => {
        switch (form) {
            case '':
                return { subject: localObject || S, object: L };
            case '?':
                return { subject: anchor, object: target };
            case '!':
                return { subject: target, object: anchor };
            default:
                return null;
        }
    };

    for (const pred of sem.predicates) {
        const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
        const role = rolesFor(pred.form);
        if (!role || !role.subject || !role.object) {
            continue;
        }
        emitQuad(
            state.quads, state.origin.quadIndex, block.id,
            role.subject, P, role.object, state.df,
            { kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
        );
    }
}
|
|
433
|
+
|
|
434
|
+
/**
 * Apply one annotation (a carrier plus its parsed semantic block) to the
 * parser state: update the current subject, record an origin block, and emit
 * the type and predicate quads.
 *
 * @param {object} carrier - Carrier (text, range, attrsRange, valueRange, type, url).
 * @param {object} sem - Parsed semantic block.
 * @param {object} state - Parser state; currentSubject, origin and quads may change.
 * @param {object} [options]
 * @param {boolean} [options.preserveGlobalSubject=false] - When true, a subject
 *   declared here is used for this annotation only and state.currentSubject
 *   is left untouched.
 * @param {object|null} [options.implicitSubject=null] - Subject to use instead
 *   of the current one (for list items); also prevents a declared subject from
 *   replacing state.currentSubject.
 * @returns {void}
 */
function processAnnotation(carrier, sem, state, options = {}) {
    const { preserveGlobalSubject = false, implicitSubject = null } = options;

    if (sem.subject === 'RESET') {
        state.currentSubject = null;
        return;
    }

    const previousSubject = state.currentSubject;
    const newSubject = resolveSubject(sem, state);
    const localObject = resolveObject(sem, state);

    // A declared subject becomes the current one unless the caller asked to
    // keep the global subject or supplied an implicit subject.
    if (newSubject && !preserveGlobalSubject && !implicitSubject) {
        state.currentSubject = newSubject;
    }
    const S = preserveGlobalSubject ? (newSubject || previousSubject) : (implicitSubject || state.currentSubject);
    if (!S) return; // nothing to attach statements to

    const block = createBlock(
        S.value, sem.types, sem.predicates, sem.entries,
        carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
        carrier.type || null, state.ctx
    );
    state.origin.blocks.set(block.id, block);

    const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
    const carrierO = carrier.url ? state.df.namedNode(expandIRI(carrier.url, state.ctx)) : null;
    const newSubjectOrCarrierO = newSubject || carrierO;

    processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state);
    processPredicateAnnotations(sem, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L, block, state);
}
|
|
468
|
+
|
|
469
|
+
// Helper functions for list item processing
|
|
470
|
+
/**
 * Look for a subject declaration in an attribute block.
 *
 * @param {string|null} attrs - Raw attribute text.
 * @param {object} state - Parser state used to resolve the subject.
 * @param {object|null} [carrierInfo] - Carrier to report alongside the subject.
 * @returns {{subject: object, carrier: object}|null} The resolved subject with
 *   carrierInfo (or a placeholder 'unknown' carrier), or null when no subject
 *   is declared, it is 'RESET', or it cannot be resolved.
 */
function findSubjectInAttrs(attrs, state, carrierInfo = null) {
    const sem = parseSemCached(attrs);
    const declaresSubject = sem.subject && sem.subject !== 'RESET';
    if (!declaresSubject) {
        return null;
    }
    const subject = resolveSubject(sem, state);
    if (!subject) {
        return null;
    }
    const carrier = carrierInfo || { type: 'unknown', text: '', attrs };
    return { subject, carrier };
}
|
|
480
|
+
|
|
481
|
+
/**
 * Determine the subject of a list item: the list token's own attributes are
 * checked first, then each inline carrier in order.
 *
 * @param {object} listToken - List token (attrs, text, range).
 * @param {object[]} carriers - Inline carriers of the item.
 * @param {object} state - Parser state used to resolve subjects.
 * @returns {{subject: object, carrier: object}|null} First subject found, or null.
 */
export function findItemSubject(listToken, carriers, state) {
    const { text, attrs, range } = listToken;
    const ownSubject = findSubjectInAttrs(attrs, state, { type: 'list', text, attrs, range });
    if (ownSubject) {
        return ownSubject;
    }

    for (const carrier of carriers) {
        const inlineSubject = findSubjectInAttrs(carrier.attrs, state, carrier);
        if (inlineSubject) {
            return inlineSubject;
        }
    }

    return null;
}
|
|
494
|
+
|
|
495
|
+
/**
 * Tell whether a list item declares predicates of its own, either on the list
 * token or on one of its inline carriers.
 *
 * A predicate counts when it has no truthy `subject` field and its iri is not
 * 'RESET'.
 *
 * @param {object} listToken - List token; only `attrs` is read.
 * @param {object[]} carriers - Inline carriers of the item.
 * @returns {boolean} true when at least one such predicate exists.
 */
function hasOwnPredicates(listToken, carriers) {
    const declaresPredicate = (attrs) =>
        parseSemCached(attrs).predicates.some((p) => !p.subject && p.iri !== 'RESET');

    if (listToken.attrs && declaresPredicate(listToken.attrs)) {
        return true;
    }
    return carriers.some((carrier) => declaresPredicate(carrier.attrs));
}
|
|
508
|
+
|
|
509
|
+
// Unified list context processing
|
|
510
|
+
/**
 * Apply a list context's semantics to one list item.
 *
 * Emits an rdf:type quad on the item for every context type, and for each
 * directional predicate either item -> context ('!') or context -> item ('?').
 * Literal predicates (form '') are never emitted here; with inheritLiterals
 * they are returned for the caller to apply.
 *
 * @param {object} args
 * @param {object} args.sem - Context semantic block (types, predicates).
 * @param {object} args.itemSubject - Subject of the list item.
 * @param {object} args.contextSubject - Subject the list context refers to.
 * @param {boolean} [args.inheritLiterals=false] - Return literal predicates.
 * @param {object} args.state - Parser state.
 * @param {string} [args.blockId='list-context'] - Block id recorded with quads.
 * @returns {object|null} A semantic block holding only the literal predicates
 *   when inheritLiterals is set and there are any; otherwise null.
 */
function processContextSem({ sem, itemSubject, contextSubject, inheritLiterals = false, state, blockId = 'list-context' }) {
    const emit = (s, p, o) =>
        emitQuad(state.quads, state.origin.quadIndex, blockId, s, p, o, state.df);

    // Emit types
    for (const t of sem.types) {
        const typeIRI = typeof t === 'string' ? t : t.iri;
        emit(
            itemSubject,
            state.df.namedNode(expandIRI('rdf:type', state.ctx)),
            state.df.namedNode(expandIRI(typeIRI, state.ctx))
        );
    }

    // Emit directional predicates
    for (const pred of sem.predicates) {
        const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
        if (pred.form === '!') {
            emit(itemSubject, P, contextSubject);
        } else if (pred.form === '?') {
            emit(contextSubject, P, itemSubject);
        }
    }

    // Optionally inherit literal predicates
    if (!inheritLiterals) {
        return null;
    }
    const literalPredicates = sem.predicates.filter((p) => p.form === '');
    // Note: caller must handle subject switching for literal inheritance
    return literalPredicates.length > 0 ? createInheritedSem(literalPredicates) : null;
}
|
|
544
|
+
|
|
545
|
+
// Lightweight semantic object constructor for inherited predicates
|
|
546
|
+
/**
 * Build a minimal semantic block that carries only the given predicates.
 *
 * @param {Array<{iri: string, form: string, entryIndex: *}>} predicates -
 *   Predicates to copy (iri, form and entryIndex only).
 * @returns {object} Semantic block with no subject, object, types, datatype,
 *   language or entries.
 */
function createInheritedSem(predicates) {
    const copied = predicates.map(({ iri, form, entryIndex }) => ({ iri, form, entryIndex }));
    return {
        subject: null,
        object: null,
        types: [],
        predicates: copied,
        datatype: null,
        language: null,
        entries: []
    };
}
|
|
557
|
+
|
|
558
|
+
// List stack management functions
|
|
559
|
+
/**
 * Bring state.listStack in line with the indentation of a list token.
 *
 * 1. Frames with a strictly larger indent than the token are popped.
 * 2. A pending list context always opens a new frame and is then cleared.
 * 3. Otherwise a new frame is opened only when the stack is empty or the
 *    token is indented deeper than the top frame; it inherits the parent's
 *    anchorSubject as both anchor and context subject.
 * 4. Same indent with no pending context leaves the stack unchanged.
 *
 * @param {object} token - List token; only `indent` is read.
 * @param {object} state - Parser state (listStack, pendingListContext).
 * @returns {void}
 */
function manageListStack(token, state) {
    const stack = state.listStack;
    const top = () => stack[stack.length - 1];

    // Pop stack frames for lists that have ended (strictly less indent)
    while (stack.length && token.indent < top().indent) {
        stack.pop();
    }

    const pending = state.pendingListContext;
    if (pending) {
        stack.push({
            indent: token.indent,
            anchorSubject: pending.subject,
            contextSubject: pending.subject,
            contextSem: pending.sem
        });
        state.pendingListContext = null;
        return;
    }

    const opensNewLevel = stack.length === 0 || token.indent > top().indent;
    if (!opensNewLevel) {
        return;
    }

    // Nested list without explicit context: inherit the parent's anchor.
    const parentFrame = stack.length > 0 ? top() : null;
    const inherited = parentFrame?.anchorSubject || null;
    stack.push({
        indent: token.indent,
        anchorSubject: inherited,
        contextSubject: inherited,
        contextSem: null
    });
}
|
|
590
|
+
|
|
591
|
+
/**
 * Process one list item: resolve its subject, apply the enclosing list
 * context, then apply the item's own and inline annotations.
 *
 * Items without a resolvable subject are ignored. The list context is applied
 * exactly once per item; applying it a second time to collect the inherited
 * literal predicates would push the context's type and directional quads into
 * state.quads twice.
 *
 * @param {object} token - List token.
 * @param {object} state - Parser state; listStack, quads and origin may change.
 * @returns {void}
 */
function processListItem(token, state) {
    const carriers = getCarriers(token);

    // Find item subject from list token or inline carriers
    const itemInfo = findItemSubject(token, carriers, state);
    if (!itemInfo) return;

    const { subject: itemSubject } = itemInfo;

    // Track this item's subject on the current frame for nested contexts
    const listFrame = state.listStack[state.listStack.length - 1];
    if (listFrame) {
        listFrame.anchorSubject = itemSubject;
    }

    // Apply list context if available
    if (listFrame?.contextSem) {
        // Literal predicates are inherited only if the item has none of its own
        const inheritedSem = processContextSem({
            sem: listFrame.contextSem,
            itemSubject,
            contextSubject: listFrame.contextSubject,
            inheritLiterals: !hasOwnPredicates(token, carriers),
            state
        });
        if (inheritedSem) {
            // Temporarily make the item the current subject so the inherited
            // literal predicates attach to it, then restore.
            const prevSubject = state.currentSubject;
            state.currentSubject = itemSubject;
            processAnnotation(createCarrierFromToken(token, 'list'), inheritedSem, state, { preserveGlobalSubject: true });
            state.currentSubject = prevSubject;
        }
    }

    // Process item's own annotations using unified function
    if (token.attrs) {
        const sem = parseSemCached(token.attrs);
        processAnnotation(createCarrierFromToken(token, 'list'), sem, state, {
            preserveGlobalSubject: !state.listStack.length,
            implicitSubject: itemSubject
        });
    }

    // Process inline carriers' annotations
    carriers.forEach(carrier => {
        if (carrier.attrs) {
            const sem = parseSemCached(carrier.attrs);
            processAnnotation(carrier, sem, state, {
                preserveGlobalSubject: !state.listStack.length,
                implicitSubject: itemSubject
            });
        }
    });
}
|
|
656
|
+
|
|
657
|
+
/**
 * Treat a paragraph ending in `{...}` as the context for the list that
 * follows, storing it on state.pendingListContext.
 *
 * The context subject is the current subject, falling back to the document
 * subject. If a list is already open and the next token is a more deeply
 * indented list item, the open frame's anchorSubject is used instead.
 * Paragraphs without a trailing attribute block are ignored.
 *
 * @param {object} token - Paragraph token; only `text` is read.
 * @param {object} state - Parser state (tokens, currentTokenIndex, listStack,
 *   currentSubject, documentSubject); pendingListContext is written.
 * @returns {void}
 */
function processListContextFromParagraph(token, state) {
    const contextMatch = token.text.match(/^(.+?)\s*\{([^}]+)\}$/);
    if (!contextMatch) {
        return;
    }

    const sem = parseSemCached(`{${contextMatch[2]}}`);
    let subject = state.currentSubject || state.documentSubject;

    // Look ahead: a deeper list item right after this paragraph means the
    // context belongs to the parent item rather than the document.
    const upcoming = state.tokens && state.tokens[state.currentTokenIndex + 1];
    const listFollows = state.listStack.length > 0 && upcoming && upcoming.type === 'list';
    if (listFollows) {
        const frame = state.listStack[state.listStack.length - 1];
        if (frame.anchorSubject && upcoming.indent > frame.indent) {
            subject = frame.anchorSubject;
        }
    }

    state.pendingListContext = { sem, subject };
}
|
|
685
|
+
|
|
686
|
+
// Helper functions for token processing
|
|
687
|
+
/**
 * Present a block-level token as a carrier for processAnnotation().
 *
 * @param {object} token - Source token (text, range, attrsRange, valueRange).
 * @param {string} tokenType - Carrier type to report.
 * @returns {object} Carrier with type, text, range, attrsRange and valueRange
 *   (the last two null when the token lacks them).
 */
function createCarrierFromToken(token, tokenType) {
    const { text, range } = token;
    const attrsRange = token.attrsRange || null;
    const valueRange = token.valueRange || null;
    return { type: tokenType, text, range, attrsRange, valueRange };
}
|
|
696
|
+
|
|
697
|
+
/**
 * Apply a token's own attribute block, then the attribute block of each of
 * its inline carriers, in order.
 *
 * @param {object} token - Block-level token.
 * @param {object} state - Parser state.
 * @param {string} tokenType - Carrier type used for the token's own annotation.
 * @returns {void}
 */
function processTokenAnnotations(token, state, tokenType) {
    // The token's own attributes come first
    if (token.attrs) {
        const ownCarrier = createCarrierFromToken(token, tokenType);
        processAnnotation(ownCarrier, parseSemCached(token.attrs), state);
    }

    // Then every inline carrier that has attributes
    for (const carrier of getCarriers(token)) {
        if (!carrier.attrs) {
            continue;
        }
        processAnnotation(carrier, parseSemCached(carrier.attrs), state);
    }
}
|
|
713
|
+
|
|
714
|
+
/**
 * Handle a paragraph that consists solely of a subject declaration `{=...}`.
 *
 * Any other paragraph is ignored. The annotation carries an empty text and no
 * value range; its attrsRange covers the whole `{=...}` block including the
 * braces, matching the convention of calcAttrsRange().
 *
 * NOTE(review): offsets are computed as token.range[0] plus an index into
 * token.text — confirm both share the same origin when the source line has
 * leading whitespace.
 *
 * @param {object} token - Paragraph token (text, range).
 * @param {object} state - Parser state.
 * @returns {void}
 */
function processStandaloneSubject(token, state) {
    const match = token.text.match(/^\s*\{=(.*?)\}\s*$/);
    if (!match) return;

    const inner = match[1] || '';
    const attrsText = `{=${inner}}`;
    const sem = parseSemCached(attrsText);
    const attrsStart = token.range[0] + token.text.indexOf('{=');
    // End just past the closing brace: "{=" + inner + "}".
    const attrsEnd = attrsStart + attrsText.length;

    processAnnotation({
        type: 'standalone',
        text: '',
        range: token.range,
        attrsRange: [attrsStart, attrsEnd],
        valueRange: null
    }, sem, state);
}
|
|
730
|
+
|
|
731
|
+
/**
 * Parse a document into RDF quads plus origin information.
 *
 * Prefix declarations anywhere in the document are registered before any
 * other token is processed, so a prefix may be used ahead of its declaration.
 *
 * @param {string} text - Document text.
 * @param {object} [options]
 * @param {object} [options.context] - Prefix mappings merged over DEFAULT_CONTEXT.
 * @param {object} [options.dataFactory] - Term factory; defaults to DataFactory.
 * @returns {{quads: Array, origin: {blocks: Map, quadIndex: Map}, context: object}}
 *   Emitted quads, the block/quad origin maps, and the final prefix context.
 */
export function parse(text, options = {}) {
    const state = {
        ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
        df: options.dataFactory || DataFactory,
        quads: [],
        origin: { blocks: new Map(), quadIndex: new Map() },
        currentSubject: null,
        documentSubject: null, // Track main document subject from headings
        listStack: [],
        pendingListContext: null,
        tokens: null, // Store tokens for lookahead
        currentTokenIndex: -1 // Track current token index
    };

    state.tokens = scanTokens(text);

    // Process prefix declarations first
    for (const token of state.tokens) {
        if (token.type === 'prefix') {
            state.ctx[token.prefix] = token.iri;
        }
    }

    // Process all other tokens
    for (const [index, token] of state.tokens.entries()) {
        state.currentTokenIndex = index;
        const { type } = token;

        if (type === 'list') {
            manageListStack(token, state);
            processListItem(token, state);
        } else if (type === 'para') {
            processStandaloneSubject(token, state);
            processListContextFromParagraph(token, state);
            processTokenAnnotations(token, state, type);
        } else if (type === 'heading' || type === 'code' || type === 'blockquote') {
            // Update document subject when processing headings
            if (type === 'heading' && token.attrs) {
                const headingSem = parseSemCached(token.attrs);
                const headingSubject = headingSem.subject ? resolveSubject(headingSem, state) : null;
                if (headingSubject) {
                    state.documentSubject = headingSubject;
                }
            }
            processTokenAnnotations(token, state, type);
        }
    }

    return { quads: state.quads, origin: state.origin, context: state.ctx };
}
|