mdld-parse 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +374 -0
- package/index.js +882 -0
- package/package.json +39 -0
- package/tests.js +409 -0
package/index.js
ADDED
|
@@ -0,0 +1,882 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MD-LD Parser — Markdown-Linked Data to RDF Quads
|
|
3
|
+
*
|
|
4
|
+
* Zero-dependency, streaming-capable parser for MD-LD documents.
|
|
5
|
+
* Outputs RDF/JS compatible quads.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// ============================================================================
|
|
9
|
+
// RDF/JS Data Factory (Minimal Implementation)
|
|
10
|
+
// ============================================================================
|
|
11
|
+
|
|
12
|
+
/**
 * Minimal RDF/JS-style data factory used when the caller does not supply one.
 * Every term is a plain object carrying `termType` and `value`.
 */
const DefaultDataFactory = {
  /** Creates a NamedNode term whose value is the given IRI string. */
  namedNode: (value) => ({ termType: 'NamedNode', value }),

  /** Creates a BlankNode term; a random label is generated when none is given. */
  blankNode: (value = `b${Math.random().toString(36).slice(2, 11)}`) => ({
    termType: 'BlankNode',
    value,
  }),

  /**
   * Creates a Literal term.
   *
   * @param {string} value - Lexical form.
   * @param {string|Object} [languageOrDatatype] - A non-empty string is taken
   *   as a language tag; an object is used as the datatype term. Anything
   *   falsy (including the empty string) yields a plain xsd:string literal.
   * @returns {Object} Literal term with `language` and `datatype` set.
   */
  literal: (value, languageOrDatatype) => {
    // An empty string is "no language", not a language-tagged literal.
    if (typeof languageOrDatatype === 'string' && languageOrDatatype !== '') {
      return {
        termType: 'Literal',
        value,
        language: languageOrDatatype,
        datatype: {
          termType: 'NamedNode',
          value: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#langString',
        },
      };
    }
    return {
      termType: 'Literal',
      value,
      language: '',
      datatype: languageOrDatatype || {
        termType: 'NamedNode',
        value: 'http://www.w3.org/2001/XMLSchema#string',
      },
    };
  },

  /** Creates a quad; the graph falls back to the default graph when omitted. */
  quad: (subject, predicate, object, graph) => ({
    subject,
    predicate,
    object,
    graph: graph || DefaultDataFactory.defaultGraph(),
  }),

  /** Creates the DefaultGraph term. */
  defaultGraph: () => ({ termType: 'DefaultGraph', value: '' }),
};
|
|
42
|
+
|
|
43
|
+
// ============================================================================
|
|
44
|
+
// YAML-LD Parser (Minimal YAML subset for frontmatter)
|
|
45
|
+
// ============================================================================
|
|
46
|
+
|
|
47
|
+
/**
 * Parses the small YAML subset used by MD-LD frontmatter into a plain object.
 *
 * Supported syntax: `key: value` scalars, quoted keys, `#` comment lines,
 * block sequences of scalars (`- item`) and mappings nested by indentation
 * (so `'@context':` followed by indented keys becomes a nested object).
 * A key with no value and no nested content is left unset.
 *
 * @param {string} yamlText - Frontmatter text without the `---` delimiters.
 * @returns {Object} The parsed mapping, or `{}` if parsing throws.
 */
function parseYAMLLD(yamlText) {
  try {
    const root = {};
    // Open mappings, innermost last; `indent` is the column of that mapping's keys.
    const scopes = [{ indent: 0, node: root }];
    // Most recent key: receives following `- item` lines or a nested mapping.
    let lastKey = null;

    for (const rawLine of yamlText.trim().split('\n')) {
      const trimmed = rawLine.trim();
      if (!trimmed || trimmed.startsWith('#')) continue;

      // Sequence item: append to the list owned by the most recent key.
      if (trimmed.startsWith('- ')) {
        if (lastKey) {
          const { container, key } = lastKey;
          if (!Array.isArray(container[key])) container[key] = [];
          container[key].push(parseYAMLValue(trimmed.substring(2).trim()));
          lastKey.awaitingValue = false;
        }
        continue;
      }

      const pair = splitYAMLPair(trimmed);
      // `__proto__` would rewire the container's prototype instead of adding a key.
      if (!pair || pair.key === '__proto__') continue;

      const indent = rawLine.length - rawLine.trimStart().length;
      if (lastKey && lastKey.awaitingValue && indent > lastKey.indent) {
        // A deeper key right after `key:` opens a nested mapping under that key.
        const child = {};
        lastKey.container[lastKey.key] = child;
        scopes.push({ indent, node: child });
      } else {
        // Dedent closes every mapping whose keys sit further to the right.
        while (scopes.length > 1 && indent < scopes[scopes.length - 1].indent) {
          scopes.pop();
        }
      }

      const container = scopes[scopes.length - 1].node;
      const awaitingValue = pair.value === '';
      lastKey = { container, key: pair.key, indent, awaitingValue };
      if (!awaitingValue) {
        container[pair.key] = parseYAMLValue(pair.value);
      }
    }

    return root;
  } catch (e) {
    console.warn('YAML-LD parse error:', e);
    return {};
  }
}

/**
 * Splits a trimmed `key: value` line into its key and raw value.
 *
 * A quoted key may itself contain colons. For unquoted keys the separator is
 * the first colon followed by whitespace or end of line, falling back to the
 * first colon of the line.
 *
 * @param {string} line - Trimmed, non-empty line.
 * @returns {{key: string, value: string}|null} `null` when the line has no key.
 */
function splitYAMLPair(line) {
  const quote = line[0];
  if (quote === '"' || quote === "'") {
    const close = line.indexOf(quote, 1);
    if (close > 0) {
      const rest = line.slice(close + 1).trimStart();
      if (rest.startsWith(':')) {
        return { key: line.slice(1, close), value: rest.slice(1).trim() };
      }
    }
  }

  const separator = line.search(/:(\s|$)/);
  const colon = separator > 0 ? separator : line.indexOf(':');
  if (colon <= 0) return null;

  return {
    key: line.slice(0, colon).trim().replace(/^['"]|['"]$/g, ''),
    value: line.slice(colon + 1).trim(),
  };
}

/**
 * Converts a raw YAML scalar into a JavaScript value.
 *
 * A scalar wrapped in matching quotes is always returned as a string (so
 * `"42"` and `"true"` keep their string type). Unquoted `true`, `false`,
 * `null`, integers and simple decimals are converted; anything else is
 * returned unchanged.
 *
 * @param {string} value - Trimmed scalar text.
 * @returns {string|number|boolean|null}
 */
function parseYAMLValue(value) {
  const quoted = value.match(/^(["'])(.*)\1$/);
  if (quoted) return quoted[2];

  if (value === 'true') return true;
  if (value === 'false') return false;
  if (value === 'null') return null;
  if (/^-?\d+$/.test(value)) return Number.parseInt(value, 10);
  if (/^-?\d+\.\d+$/.test(value)) return Number.parseFloat(value);

  return value;
}
|
|
121
|
+
|
|
122
|
+
// ============================================================================
|
|
123
|
+
// Markdown Tokenizer (Minimal - focuses on structure)
|
|
124
|
+
// ============================================================================
|
|
125
|
+
|
|
126
|
+
/**
 * Splits Markdown text into a flat list of line-level tokens.
 *
 * Token shapes produced:
 *  - `{ type: 'code', lang, text, attrs }` for fenced blocks
 *  - `{ type: 'heading', depth, text, attrs }`
 *  - `{ type: 'taskItem', indent, checked, text, attrs }`
 *  - `{ type: 'listItem', indent, text, attrs }`
 *  - `{ type: 'paragraph', text }` (one per non-blank line)
 *  - `{ type: 'blank' }`
 *
 * Both LF and CRLF line endings are accepted. A fenced block that is still
 * open at the end of the input is emitted with the lines collected so far.
 *
 * @param {string} text - Markdown body (frontmatter already removed).
 * @returns {Object[]} Tokens in document order.
 */
function tokenizeMarkdown(text) {
  const tokens = [];
  // Accept CRLF: a stray '\r' would defeat the `$`-anchored patterns below.
  const lines = text.split(/\r?\n/);
  // State of the fenced block currently being read, or null outside a fence.
  let openBlock = null;

  const closeBlock = () => {
    tokens.push({
      type: 'code',
      lang: openBlock.lang,
      text: openBlock.lines.join('\n'),
      attrs: openBlock.attrs,
    });
    openBlock = null;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const trimmed = line.trim();

    // Fenced code block: ```lang {attrs}
    const fenceMatch = line.match(/^(```+)(.*)$/);
    if (fenceMatch) {
      const [, fence, rest] = fenceMatch;

      if (!openBlock) {
        openBlock = { fence, lang: null, attrs: {}, lines: [] };

        const info = rest.trim();
        if (info) {
          // Language is the first word before any `{...}` attribute block.
          const braceAt = info.indexOf('{');
          const langPart = braceAt >= 0 ? info.substring(0, braceAt).trim() : info;
          if (langPart) {
            openBlock.lang = langPart.split(/\s+/)[0];
          }

          const attrMatch = info.match(/\{[^}]+\}/);
          if (attrMatch) {
            openBlock.attrs = parseAttributes(attrMatch[0]);
          }
        }
        continue;
      }

      // A fence of the same length as the opening one closes the block;
      // any other fence line is ordinary code content.
      if (fence === openBlock.fence) {
        closeBlock();
        continue;
      }
    }

    if (openBlock) {
      openBlock.lines.push(line);
      continue;
    }

    // Heading, with attributes either inline or alone on the next line.
    const headingMatch = line.match(/^(#{1,6})\s+(.+?)(\s*\{[^}]+\})?$/);
    if (headingMatch) {
      const [, hashes, headingText, inlineAttrs] = headingMatch;
      let attributes = inlineAttrs ? parseAttributes(inlineAttrs) : {};

      if (!inlineAttrs && i + 1 < lines.length) {
        const nextLine = lines[i + 1].trim();
        if (/^\{[^}]+\}$/.test(nextLine)) {
          attributes = parseAttributes(nextLine);
          i++; // the attribute line belongs to this heading
        }
      }

      tokens.push({
        type: 'heading',
        depth: hashes.length,
        text: headingText.trim(),
        attrs: attributes,
      });
      continue;
    }

    // Task list item: must be tested before the generic list item.
    const taskMatch = line.match(/^(\s*)([-*+])\s+\[([ xX])\]\s+(.+?)(\s*\{[^}]+\})?$/);
    if (taskMatch) {
      const [, indent, , checked, taskText, taskAttrs] = taskMatch;
      tokens.push({
        type: 'taskItem',
        indent: indent.length,
        checked: checked.toLowerCase() === 'x',
        text: taskText.trim(),
        attrs: taskAttrs ? parseAttributes(taskAttrs) : {},
      });
      continue;
    }

    // Regular list item.
    const listMatch = line.match(/^(\s*)([-*+]|\d+\.)\s+(.+?)(\s*\{[^}]+\})?$/);
    if (listMatch) {
      const [, indent, , itemText, itemAttrs] = listMatch;

      // Trailing attributes (e.g. `- [Link](#id){rel="hasPart"}`) are kept in
      // the text as well so the inline parser can bind them to the link/span.
      const combinedText = itemAttrs ? `${itemText}${itemAttrs.trim()}` : itemText;

      tokens.push({
        type: 'listItem',
        indent: indent.length,
        text: combinedText.trim(),
        attrs: itemAttrs ? parseAttributes(itemAttrs) : {},
      });
      continue;
    }

    // Paragraph line; rule lines (`---`) and indented fences are skipped.
    if (trimmed && !trimmed.match(/^(---|```)/)) {
      tokens.push({ type: 'paragraph', text: line });
      continue;
    }

    if (!trimmed) {
      tokens.push({ type: 'blank' });
    }
  }

  // Unterminated fence: keep its content instead of silently dropping it.
  if (openBlock) {
    closeBlock();
  }

  return tokens;
}
|
|
279
|
+
|
|
280
|
+
// ============================================================================
|
|
281
|
+
// Attribute Parser {#id .class key="value"}
|
|
282
|
+
// ============================================================================
|
|
283
|
+
|
|
284
|
+
/**
 * Parses an attribute block such as `{#id .class key="value"}`.
 *
 * Quoted `key="value"` / `key='value'` pairs are extracted first and removed
 * from the text, so `#` and `.` characters inside values (IRIs, fragments,
 * decimals) are never mistaken for an id or a class.
 *
 * @param {string} attrString - Attribute text, with or without the braces
 *   and surrounding whitespace.
 * @returns {Object} Map with optional `id`, optional `class` (space-joined)
 *   and one entry per key/value pair. Pairs are assigned last, so a pair
 *   named `id` or `class` overrides the shorthand form.
 */
function parseAttributes(attrString) {
  const attrs = {};
  const cleaned = attrString.trim().replace(/^\{|\}$/g, '').trim();

  // Pull out key/value pairs; the closing quote must match the opening one.
  const pairs = [];
  const shorthand = cleaned.replace(
    /(\w+)=(?:"([^"]*)"|'([^']*)')/g,
    (whole, key, doubleQuoted, singleQuoted) => {
      pairs.push([key, doubleQuoted !== undefined ? doubleQuoted : singleQuoted]);
      return ' ';
    }
  );

  // ID: #something (first occurrence wins)
  const idMatch = shorthand.match(/#([^\s.]+)/);
  if (idMatch) attrs.id = idMatch[1];

  // Classes: .class1 .class2
  const classMatches = shorthand.match(/\.([^\s.#]+)/g);
  if (classMatches) {
    attrs.class = classMatches.map((c) => c.substring(1)).join(' ');
  }

  for (const [key, value] of pairs) {
    attrs[key] = value;
  }

  return attrs;
}
|
|
307
|
+
|
|
308
|
+
// ============================================================================
|
|
309
|
+
// Inline Parser (for [text](url){attrs} and [text]{attrs})
|
|
310
|
+
// ============================================================================
|
|
311
|
+
|
|
312
|
+
/**
 * Splits a line of text into plain-text runs and annotated spans.
 *
 * Recognised forms are `[text](url){attrs}` and `[text]{attrs}`; both the
 * `(url)` and `{attrs}` parts are optional, so a bare `[text]` is reported
 * as a span with empty attributes.
 *
 * @param {string} text - Inline Markdown text.
 * @returns {Object[]} Items in source order: `{ type: 'text', value }` for
 *   plain runs, and `{ type: 'link'|'span', text, url, attrs }` for bracketed
 *   spans (`url` is `null` for a span). Empty input yields a single empty
 *   text item.
 */
function parseInline(text) {
  const inlineRegex = /\[([^\]]+)\](?:\(([^)]+)\))?(?:\{([^}]+)\})?/g;
  const spans = [];
  // Offset of the first character not yet covered by an emitted item.
  let cursor = 0;

  for (const match of text.matchAll(inlineRegex)) {
    const [fullMatch, linkText, url, attrs] = match;

    if (match.index > cursor) {
      spans.push({ type: 'text', value: text.substring(cursor, match.index) });
    }

    spans.push({
      type: url ? 'link' : 'span',
      text: linkText,
      url: url || null,
      attrs: attrs ? parseAttributes(`{${attrs}}`) : {},
    });

    cursor = match.index + fullMatch.length;
  }

  if (cursor < text.length) {
    spans.push({ type: 'text', value: text.substring(cursor) });
  }

  return spans.length > 0 ? spans : [{ type: 'text', value: text }];
}
|
|
351
|
+
|
|
352
|
+
// ============================================================================
|
|
353
|
+
// MD-LD Parser
|
|
354
|
+
// ============================================================================
|
|
355
|
+
|
|
356
|
+
/**
 * Converts an MD-LD document (Markdown with optional YAML-LD frontmatter and
 * `{...}` attribute annotations) into an array of quads built with an
 * RDF/JS-style data factory.
 */
export class MDLDParser {
  /**
   * @param {Object} [options]
   * @param {string} [options.baseIRI=''] - IRI of the document; fragments are
   *   resolved against it.
   * @param {string} [options.defaultVocab='http://schema.org/'] - Namespace
   *   prepended to bare terms when the frontmatter declares no `@vocab`.
   * @param {Object} [options.dataFactory] - Factory providing `namedNode`,
   *   `blankNode`, `literal` and `quad`; defaults to the built-in factory.
   */
  constructor(options = {}) {
    // Defaults are applied after the spread so that an explicit `undefined`
    // (or `null`) option cannot wipe them out.
    this.options = {
      ...options,
      baseIRI: options.baseIRI ?? '',
      defaultVocab: options.defaultVocab ?? 'http://schema.org/',
      dataFactory: options.dataFactory ?? DefaultDataFactory,
    };

    this.df = this.options.dataFactory;
    this.quads = [];
    this.context = null;
    this.rootSubject = null;
    this.currentSubject = null;
    this.blankNodeCounter = 0;
    this.subjectStack = [];
    this.blankNodeMap = new Map();
  }

  /**
   * Derives a deterministic blank-node label from a string (djb2-style hash),
   * memoised per input.
   *
   * @param {string} input
   * @returns {string} Label of the form `b<hex>`.
   */
  hashBlankNode(input) {
    if (this.blankNodeMap.has(input)) {
      return this.blankNodeMap.get(input);
    }
    let hash = 5381;
    for (let i = 0; i < input.length; i++) {
      hash = ((hash << 5) + hash) + input.charCodeAt(i);
    }
    const bnId = `b${Math.abs(hash).toString(16).slice(0, 12)}`;
    this.blankNodeMap.set(input, bnId);
    return bnId;
  }

  /**
   * Parses one document. Per-document state is reset on entry, and a base IRI
   * taken from the document's `@base` applies to this call only.
   *
   * @param {string} markdown - Full document text.
   * @returns {Object[]} The quads emitted for this document.
   */
  parse(markdown) {
    this.quads = [];
    this.subjectStack = [];
    this.blankNodeMap = new Map();
    const configuredBaseIRI = this.options.baseIRI;

    try {
      const { frontmatter, body } = this.extractFrontmatter(markdown);

      if (frontmatter) {
        try {
          this.context = parseYAMLLD(frontmatter);

          // @base inside @context overrides the configured base IRI.
          const declaredBase = this.context['@context']?.['@base'];
          if (declaredBase) {
            this.options.baseIRI = declaredBase;
          }

          this.rootSubject = this.resolveRootSubject(this.context);

          const rootTypes = this.context['@type'];
          if (rootTypes) {
            this.emitTypes(this.rootSubject, Array.isArray(rootTypes) ? rootTypes : [rootTypes]);
          }
        } catch (e) {
          console.error('YAML-LD parse error:', e);
          this.useDefaultContext();
        }
      } else {
        // No frontmatter - the base IRI is the root subject.
        this.useDefaultContext();
      }

      this.currentSubject = this.rootSubject;
      this.processTokens(tokenizeMarkdown(body));

      return this.quads;
    } finally {
      // Do not let one document's @base leak into the next parse() call.
      this.options.baseIRI = configuredBaseIRI;
    }
  }

  /** Installs the fallback context (default vocab only) and root subject. */
  useDefaultContext() {
    this.context = {
      '@context': { '@vocab': this.options.defaultVocab }
    };
    this.rootSubject = this.df.namedNode(this.options.baseIRI || '');
  }

  /**
   * Separates a leading `---` delimited frontmatter block from the body.
   * LF and CRLF line endings are accepted, and the closing `---` may be the
   * last line of the input.
   *
   * @param {string} markdown
   * @returns {{frontmatter: (string|null), body: string}}
   */
  extractFrontmatter(markdown) {
    const match = markdown.match(/^---\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n([\s\S]*))?$/);
    if (match) {
      return { frontmatter: match[1], body: match[2] ?? '' };
    }
    return { frontmatter: null, body: markdown };
  }

  /**
   * Builds the document's root subject from the frontmatter `@id`.
   * `#frag` and relative ids are appended to the base IRI, `_:x` becomes a
   * blank node, anything containing `:` is used as an IRI as-is. Without an
   * `@id` the base IRI itself is the subject.
   *
   * @param {Object} context - Parsed frontmatter.
   * @returns {Object} NamedNode or BlankNode term.
   */
  resolveRootSubject(context) {
    const base = this.options.baseIRI || '';
    const rawId = context['@id'];
    if (!rawId) {
      return this.df.namedNode(base);
    }

    // The YAML reader may hand back a number for an all-digit id.
    const id = String(rawId);
    if (id.startsWith('#')) {
      return this.df.namedNode(base + id);
    }
    if (id.startsWith('_:')) {
      return this.df.blankNode(id.substring(2));
    }
    if (id.includes(':')) {
      return this.df.namedNode(id);
    }
    return this.df.namedNode(base + id);
  }

  /** @returns {string} Fragment of the root subject's value, or `''`. */
  getRootFragment() {
    const rootValue = this.rootSubject.value;
    const hashIndex = rootValue.lastIndexOf('#');
    return hashIndex >= 0 ? rootValue.substring(hashIndex + 1) : '';
  }

  /**
   * Maps an `#id` attribute to a subject: the root subject when the id equals
   * the root's own fragment, otherwise `<root without fragment>#<id>`.
   *
   * @param {string} fragmentId
   * @returns {Object} Subject term.
   */
  subjectForFragment(fragmentId) {
    if (fragmentId === this.getRootFragment()) {
      return this.rootSubject;
    }
    const documentIRI = this.rootSubject.value.split('#')[0];
    return this.df.namedNode(documentIRI + '#' + fragmentId);
  }

  /** Splits a whitespace-separated attribute value into its terms. */
  splitTerms(value) {
    return value.trim().split(/\s+/).filter(Boolean);
  }

  /**
   * Emits one rdf:type quad per resolvable type name.
   *
   * @param {Object} subject
   * @param {Array} typeNames - Terms, CURIEs or IRIs; unresolvable entries
   *   are skipped.
   */
  emitTypes(subject, typeNames) {
    for (const typeName of typeNames) {
      const typeNode = this.resolveResource(typeName);
      if (typeNode) {
        this.emitQuad(
          subject,
          this.df.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
          typeNode
        );
      }
    }
  }

  /**
   * Walks the token list and emits quads for headings, code blocks,
   * paragraphs, list items and task items.
   *
   * @param {Object[]} tokens - Output of the Markdown tokenizer.
   */
  processTokens(tokens) {
    let firstParagraph = true;
    let titleEmitted = false;

    for (const token of tokens) {
      switch (token.type) {
        case 'heading':
          // First h1 without #id labels the root subject.
          if (token.depth === 1 && !titleEmitted && !token.attrs.id) {
            this.emitQuad(
              this.rootSubject,
              this.df.namedNode('http://www.w3.org/2000/01/rdf-schema#label'),
              this.df.literal(token.text)
            );
            titleEmitted = true;
          }
          // Headings without #id leave the current subject untouched.
          // NOTE(review): an earlier branch meant to reset the current subject
          // to the root on an id-less h1 could never execute; confirm whether
          // that reset is actually wanted before adding it.
          if (token.attrs.id) {
            this.processSubjectHeading(token);
          }
          break;

        case 'code':
          this.processCodeBlock(token);
          break;

        case 'paragraph':
          // First paragraph after the title becomes the description,
          // provided it is plain text without bracketed spans.
          if (firstParagraph && titleEmitted) {
            const text = token.text.trim();
            if (text && !text.match(/\[.*\]/)) {
              this.emitQuad(
                this.rootSubject,
                this.df.namedNode('http://purl.org/dc/terms/description'),
                this.df.literal(text)
              );
            }
            firstParagraph = false;
          }
          this.processInline(token.text);
          break;

        case 'listItem':
          this.processInline(token.text);
          break;

        case 'taskItem':
          this.processTaskItem(token);
          break;

        default:
          break;
      }
    }
  }

  /**
   * Heading with `#id`: declares a subject, types it, labels it with the
   * heading text and makes it the current subject.
   *
   * @param {Object} token - Heading token whose `attrs.id` is set.
   */
  processSubjectHeading(token) {
    const newSubject = this.subjectForFragment(token.attrs.id);

    if (token.attrs.typeof) {
      this.emitTypes(newSubject, this.splitTerms(token.attrs.typeof));
    }

    this.emitQuad(
      newSubject,
      this.df.namedNode('http://www.w3.org/2000/01/rdf-schema#label'),
      this.df.literal(token.text.trim())
    );

    this.currentSubject = newSubject;
    this.subjectStack.push(newSubject);
  }

  /**
   * Fenced code block: emits a typed resource (default type
   * `SoftwareSourceCode`) with `programmingLanguage` and `text`, linked from
   * the current subject through `hasPart`.
   *
   * @param {Object} token - Code token.
   */
  processCodeBlock(token) {
    const snippetSubject = token.attrs?.id
      ? this.subjectForFragment(token.attrs.id)
      : this.df.blankNode(this.hashBlankNode(`code:${token.lang || ''}:${token.text}`));

    // typeof overrides the default type.
    if (token.attrs?.typeof) {
      this.emitTypes(snippetSubject, this.splitTerms(token.attrs.typeof));
    } else {
      this.emitTypes(snippetSubject, ['SoftwareSourceCode']);
    }

    if (token.lang) {
      const langPred = this.resolveResource('programmingLanguage');
      if (langPred) {
        this.emitQuad(snippetSubject, langPred, this.df.literal(token.lang));
      }
    }

    const textPred = this.resolveResource('text');
    if (textPred && token.text) {
      this.emitQuad(snippetSubject, textPred, this.df.literal(token.text));
    }

    const hasPartPred = this.resolveResource('hasPart');
    if (hasPartPred) {
      this.emitQuad(this.currentSubject, hasPartPred, snippetSubject);
    }
  }

  /**
   * Task list item: emits an action resource (schema.org `Action` unless
   * `typeof` is given) with name and status, linked from the current subject
   * through `potentialAction`.
   *
   * @param {Object} token - Task item token.
   */
  processTaskItem(token) {
    const action = token.attrs.id
      ? this.subjectForFragment(token.attrs.id)
      : this.df.blankNode(this.hashBlankNode(`task:${token.text}`));

    if (token.attrs.typeof) {
      this.emitTypes(action, this.splitTerms(token.attrs.typeof));
    } else {
      this.emitQuad(
        action,
        this.df.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
        this.df.namedNode('http://schema.org/Action')
      );
    }

    this.emitQuad(
      action,
      this.df.namedNode('http://schema.org/name'),
      this.df.literal(token.text)
    );

    const status = token.checked
      ? 'http://schema.org/CompletedActionStatus'
      : 'http://schema.org/PotentialActionStatus';

    this.emitQuad(
      action,
      this.df.namedNode('http://schema.org/actionStatus'),
      this.df.namedNode(status)
    );

    this.emitQuad(
      this.currentSubject,
      this.df.namedNode('http://schema.org/potentialAction'),
      action
    );
  }

  /**
   * Emits quads for every annotated link/span found in a line of text.
   *
   * @param {string} text - Inline Markdown text.
   */
  processInline(text) {
    for (const span of parseInline(text)) {
      if (span.type !== 'link' && span.type !== 'span') {
        continue;
      }

      const attrs = span.attrs;

      // Subject declaration: #id switches the subject for this span only.
      let subject = this.currentSubject;
      if (attrs.id) {
        subject = this.subjectForFragment(attrs.id);
        if (attrs.typeof) {
          this.emitTypes(subject, this.splitTerms(attrs.typeof));
        }
      }

      // Property: the span text becomes a literal object.
      if (attrs.property) {
        for (const prop of this.splitTerms(attrs.property)) {
          const predicate = this.resolveResource(prop);
          if (!predicate) continue;

          const datatypeIRI = attrs.datatype ? this.resolveResource(attrs.datatype) : null;
          const object = datatypeIRI && datatypeIRI.value
            ? this.df.literal(span.text, datatypeIRI)
            : this.df.literal(span.text);

          this.emitQuad(subject, predicate, object);
        }
      }

      // Relationship: the link target becomes a resource object.
      if (attrs.rel && span.url) {
        const objectNode = span.url.startsWith('#')
          ? this.df.namedNode(this.rootSubject.value.split('#')[0] + span.url)
          : this.df.namedNode(span.url);

        for (const rel of this.splitTerms(attrs.rel)) {
          const predicate = this.resolveResource(rel);
          if (predicate) {
            this.emitQuad(subject, predicate, objectNode);
          }
        }
      }

      // typeof + rel without #id: a typed blank node linked from the subject.
      if (attrs.typeof && !attrs.id && attrs.rel) {
        // The doubled closing brace is part of the historical hash key; it is
        // kept so blank-node labels stay stable across versions.
        const blankSubject = this.df.blankNode(
          this.hashBlankNode(`span:${span.text}:${JSON.stringify(attrs)}}`)
        );

        this.emitTypes(blankSubject, this.splitTerms(attrs.typeof));

        for (const rel of this.splitTerms(attrs.rel)) {
          const predicate = this.resolveResource(rel);
          if (predicate) {
            this.emitQuad(subject, predicate, blankSubject);
          }
        }
      }
    }
  }

  /**
   * Resolves a term to a NamedNode.
   * `http(s):` IRIs are used as-is; `prefix:reference` is expanded with a
   * prefix from `@context` (`xsd` is built in); everything else is appended
   * to `@vocab` or the default vocabulary.
   *
   * @param {*} term
   * @returns {Object|null} NamedNode, or `null` for empty / non-string input.
   */
  resolveResource(term) {
    if (!term || typeof term !== 'string') return null;

    const trimmed = term.trim();
    if (!trimmed) return null;

    // Absolute IRI
    if (trimmed.match(/^https?:/)) {
      return this.df.namedNode(trimmed);
    }

    // CURIE: split at the first colon only, the reference may contain more.
    const colonAt = trimmed.indexOf(':');
    if (colonAt >= 0) {
      const prefix = trimmed.slice(0, colonAt);
      const reference = trimmed.slice(colonAt + 1);
      const contextObj = this.context?.['@context'] || {};

      const namespace = contextObj[prefix];
      if (typeof namespace === 'string' && namespace) {
        return this.df.namedNode(namespace + reference);
      }

      // Default XSD namespace
      if (prefix === 'xsd') {
        return this.df.namedNode('http://www.w3.org/2001/XMLSchema#' + reference);
      }
    }

    // Default vocab
    const vocab = this.context?.['@context']?.['@vocab'] || this.options.defaultVocab;
    return this.df.namedNode(vocab + trimmed);
  }

  /** Appends a quad to the result; calls with a missing term are ignored. */
  emitQuad(subject, predicate, object) {
    if (!subject || !predicate || !object) return;

    this.quads.push(this.df.quad(subject, predicate, object));
  }

  /** @returns {Object[]} Quads produced by the most recent parse() call. */
  getQuads() {
    return this.quads;
  }
}
|
|
872
|
+
|
|
873
|
+
// ============================================================================
|
|
874
|
+
// Convenience API
|
|
875
|
+
// ============================================================================
|
|
876
|
+
|
|
877
|
+
/**
 * One-shot helper: builds a parser with the given options and parses a
 * single document.
 *
 * @param {string} markdown - MD-LD document text.
 * @param {Object} [options] - Passed unchanged to the MDLDParser constructor.
 * @returns {Object[]} Whatever the parser's parse() call returns.
 */
export function parseMDLD(markdown, options = {}) {
  return new MDLDParser(options).parse(markdown);
}
|
|
881
|
+
|
|
882
|
+
export default { MDLDParser, parseMDLD, DefaultDataFactory };
|