mdld-parse 0.6.2 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +119 -473
- package/package.json +1 -4
- package/src/generate.js +5 -89
- package/src/index.js +1 -1
- package/src/locate.js +21 -58
- package/src/merge.js +131 -0
- package/src/parse.js +134 -24
- package/src/utils.js +37 -120
- package/src/applyDiff.js +0 -583
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdld-parse",
|
|
3
|
-
"version": "0.6.2",
|
|
3
|
+
"version": "0.7.1",
|
|
4
4
|
"description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -38,8 +38,5 @@
|
|
|
38
38
|
"homepage": "https://mdld.js.org",
|
|
39
39
|
"bugs": {
|
|
40
40
|
"url": "https://github.com/davay42/mdld-parse/issues"
|
|
41
|
-
},
|
|
42
|
-
"devDependencies": {
|
|
43
|
-
"n3": "^2.0.1"
|
|
44
41
|
}
|
|
45
42
|
}
|
package/src/generate.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { shortenIRI, expandIRI,
|
|
1
|
+
import { shortenIRI, expandIRI, DEFAULT_CONTEXT, DataFactory } from './utils.js';
|
|
2
2
|
|
|
3
3
|
// Helper functions for cleaner term type checking
|
|
4
4
|
function isLiteral(term) {
|
|
@@ -29,7 +29,7 @@ function extractLocalName(iri) {
|
|
|
29
29
|
* Generate deterministic MDLD from RDF quads
|
|
30
30
|
* Purpose: TTL→MDLD conversion with canonical structure
|
|
31
31
|
* Input: RDF quads + context
|
|
32
|
-
* Output: MDLD text
|
|
32
|
+
* Output: MDLD text
|
|
33
33
|
*/
|
|
34
34
|
export function generate(quads, context = {}) {
|
|
35
35
|
const fullContext = { ...DEFAULT_CONTEXT, ...context };
|
|
@@ -38,13 +38,9 @@ export function generate(quads, context = {}) {
|
|
|
38
38
|
|
|
39
39
|
const subjectGroups = groupQuadsBySubject(normalizedQuads);
|
|
40
40
|
|
|
41
|
-
const { text, quadMap } = buildDeterministicMDLD(subjectGroups, fullContext);
|
|
41
|
+
const { text } = buildDeterministicMDLD(subjectGroups, fullContext);
|
|
42
42
|
|
|
43
|
-
return {
|
|
44
|
-
text,
|
|
45
|
-
origin: { quadMap },
|
|
46
|
-
context: fullContext
|
|
47
|
-
};
|
|
43
|
+
return text;
|
|
48
44
|
}
|
|
49
45
|
|
|
50
46
|
function normalizeAndSortQuads(quads) {
|
|
@@ -86,8 +82,6 @@ function groupQuadsBySubject(quads) {
|
|
|
86
82
|
|
|
87
83
|
function buildDeterministicMDLD(subjectGroups, context) {
|
|
88
84
|
let text = '';
|
|
89
|
-
let currentPos = 0;
|
|
90
|
-
const quadMap = new Map();
|
|
91
85
|
|
|
92
86
|
// Add prefixes first (deterministic order), but exclude default context prefixes
|
|
93
87
|
const sortedPrefixes = Object.entries(context).sort(([a], [b]) => a.localeCompare(b));
|
|
@@ -96,13 +90,11 @@ function buildDeterministicMDLD(subjectGroups, context) {
|
|
|
96
90
|
if (prefix !== '@vocab' && !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix]) {
|
|
97
91
|
const prefixDecl = `[${prefix}] <${namespace}>\n`;
|
|
98
92
|
text += prefixDecl;
|
|
99
|
-
currentPos += prefixDecl.length;
|
|
100
93
|
}
|
|
101
94
|
}
|
|
102
95
|
|
|
103
96
|
if (sortedPrefixes.length > 0) {
|
|
104
97
|
text += '\n';
|
|
105
|
-
currentPos += 1;
|
|
106
98
|
}
|
|
107
99
|
|
|
108
100
|
// Process subjects in deterministic order
|
|
@@ -125,31 +117,7 @@ function buildDeterministicMDLD(subjectGroups, context) {
|
|
|
125
117
|
|
|
126
118
|
const headingText = `# ${localSubjectName} {=${shortSubject}${typeAnnotations}}\n\n`;
|
|
127
119
|
|
|
128
|
-
const headingBlock = {
|
|
129
|
-
id: generateBlockId(),
|
|
130
|
-
range: { start: currentPos, end: currentPos + headingText.length },
|
|
131
|
-
subject: subjectIRI,
|
|
132
|
-
types: types.map(t => t.object.value),
|
|
133
|
-
predicates: [],
|
|
134
|
-
context: { ...context },
|
|
135
|
-
carrierType: 'heading',
|
|
136
|
-
attrsRange: { start: currentPos + headingText.indexOf('{'), end: currentPos + headingText.indexOf('}') + 1 },
|
|
137
|
-
valueRange: { start: currentPos + 2, end: currentPos + 2 + localSubjectName.length }
|
|
138
|
-
};
|
|
139
|
-
|
|
140
|
-
// Add type quads to quadMap
|
|
141
|
-
types.forEach((quad, i) => {
|
|
142
|
-
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
143
|
-
quadMap.set(key, createUnifiedSlot(headingBlock, i, {
|
|
144
|
-
kind: 'type',
|
|
145
|
-
subject: quad.subject,
|
|
146
|
-
predicate: quad.predicate,
|
|
147
|
-
object: quad.object
|
|
148
|
-
}));
|
|
149
|
-
});
|
|
150
|
-
|
|
151
120
|
text += headingText;
|
|
152
|
-
currentPos += headingText.length;
|
|
153
121
|
|
|
154
122
|
// Add literals (deterministic order)
|
|
155
123
|
const sortedLiterals = literals.sort((a, b) => a.predicate.value.localeCompare(b.predicate.value));
|
|
@@ -166,30 +134,7 @@ function buildDeterministicMDLD(subjectGroups, context) {
|
|
|
166
134
|
}
|
|
167
135
|
|
|
168
136
|
const literalText = `[${quad.object.value}] {${annotation}}\n`;
|
|
169
|
-
const literalBlock = {
|
|
170
|
-
id: generateBlockId(),
|
|
171
|
-
range: { start: currentPos, end: currentPos + literalText.length },
|
|
172
|
-
subject: subjectIRI,
|
|
173
|
-
types: [],
|
|
174
|
-
predicates: [{ iri: quad.predicate.value, form: '' }],
|
|
175
|
-
context: { ...context },
|
|
176
|
-
carrierType: 'span',
|
|
177
|
-
valueRange: { start: currentPos + 1, end: currentPos + 1 + quad.object.value.length },
|
|
178
|
-
attrsRange: { start: currentPos + literalText.indexOf('{'), end: currentPos + literalText.indexOf('}') + 1 }
|
|
179
|
-
};
|
|
180
|
-
|
|
181
|
-
// Add to quadMap
|
|
182
|
-
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
183
|
-
quadMap.set(key, createUnifiedSlot(literalBlock, 0, {
|
|
184
|
-
kind: 'pred',
|
|
185
|
-
subject: quad.subject,
|
|
186
|
-
predicate: quad.predicate,
|
|
187
|
-
object: quad.object,
|
|
188
|
-
form: ''
|
|
189
|
-
}));
|
|
190
|
-
|
|
191
137
|
text += literalText;
|
|
192
|
-
currentPos += literalText.length;
|
|
193
138
|
}
|
|
194
139
|
|
|
195
140
|
// Add objects (deterministic order)
|
|
@@ -198,40 +143,11 @@ function buildDeterministicMDLD(subjectGroups, context) {
|
|
|
198
143
|
const objShort = shortenIRI(quad.object.value, context);
|
|
199
144
|
const predShort = shortenIRI(quad.predicate.value, context);
|
|
200
145
|
const objectText = `[${objShort}] {+${objShort} ?${predShort}}\n`;
|
|
201
|
-
|
|
202
|
-
const objectBlock = {
|
|
203
|
-
id: generateBlockId(),
|
|
204
|
-
range: { start: currentPos, end: currentPos + objectText.length },
|
|
205
|
-
subject: subjectIRI,
|
|
206
|
-
types: [],
|
|
207
|
-
predicates: [{ iri: quad.predicate.value, form: '?' }],
|
|
208
|
-
context: { ...context },
|
|
209
|
-
carrierType: 'span',
|
|
210
|
-
valueRange: { start: currentPos + 1, end: currentPos + 1 + objShort.length },
|
|
211
|
-
attrsRange: { start: currentPos + objectText.indexOf('{'), end: currentPos + objectText.indexOf('}') + 1 }
|
|
212
|
-
};
|
|
213
|
-
|
|
214
|
-
// Add to quadMap
|
|
215
|
-
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
216
|
-
quadMap.set(key, createUnifiedSlot(objectBlock, 0, {
|
|
217
|
-
kind: 'pred',
|
|
218
|
-
subject: quad.subject,
|
|
219
|
-
predicate: quad.predicate,
|
|
220
|
-
object: quad.object,
|
|
221
|
-
form: '?'
|
|
222
|
-
}));
|
|
223
|
-
|
|
224
146
|
text += objectText;
|
|
225
|
-
currentPos += objectText.length;
|
|
226
147
|
}
|
|
227
148
|
|
|
228
149
|
text += '\n';
|
|
229
|
-
currentPos += 1;
|
|
230
150
|
}
|
|
231
151
|
|
|
232
|
-
return { text, quadMap };
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
function generateBlockId() {
|
|
236
|
-
return Math.random().toString(36).substring(2, 10);
|
|
152
|
+
return { text };
|
|
237
153
|
}
|
package/src/index.js
CHANGED
package/src/locate.js
CHANGED
|
@@ -1,75 +1,38 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { normalizeQuad, quadIndexKey } from './utils.js';
|
|
1
|
+
import { quadToKeyForOrigin } from './utils.js';
|
|
3
2
|
|
|
4
3
|
/**
|
|
5
|
-
* Locate the
|
|
4
|
+
* Locate the origin entry for a quad using the lean origin system
|
|
6
5
|
*
|
|
7
6
|
* @param {Object} quad - The quad to locate (subject, predicate, object)
|
|
8
|
-
* @param {Object} origin - Origin object containing
|
|
9
|
-
* @
|
|
10
|
-
* @param {Object} context - Context for parsing (optional, used if text needs parsing)
|
|
11
|
-
* @returns {Object|null} Range information or null if not found
|
|
7
|
+
* @param {Object} origin - Origin object containing quadIndex
|
|
8
|
+
* @returns {Object|null} Origin entry or null if not found
|
|
12
9
|
*/
|
|
13
|
-
export function locate(quad, origin
|
|
14
|
-
|
|
15
|
-
if (!origin && text) {
|
|
16
|
-
const parseResult = parse(text, { context });
|
|
17
|
-
origin = parseResult.origin;
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
if (!quad || !origin || !origin.quadMap) {
|
|
10
|
+
export function locate(quad, origin) {
|
|
11
|
+
if (!quad || !origin || !origin.quadIndex) {
|
|
21
12
|
return null;
|
|
22
13
|
}
|
|
23
14
|
|
|
24
|
-
//
|
|
25
|
-
const
|
|
26
|
-
if (!
|
|
15
|
+
// Generate the quad key to lookup in quadIndex
|
|
16
|
+
const quadKey = quadToKeyForOrigin(quad);
|
|
17
|
+
if (!quadKey) {
|
|
27
18
|
return null;
|
|
28
19
|
}
|
|
29
20
|
|
|
30
|
-
//
|
|
31
|
-
const
|
|
32
|
-
|
|
33
|
-
// Find the slot information in quadMap
|
|
34
|
-
const slotInfo = origin.quadMap.get(quadKey);
|
|
35
|
-
if (!slotInfo) {
|
|
21
|
+
// Find the origin entry in quadIndex
|
|
22
|
+
const entry = origin.quadIndex.get(quadKey);
|
|
23
|
+
if (!entry) {
|
|
36
24
|
return null;
|
|
37
25
|
}
|
|
38
26
|
|
|
39
|
-
//
|
|
40
|
-
const block = slotInfo;
|
|
41
|
-
|
|
42
|
-
// Extract the actual text content based on carrier type
|
|
43
|
-
let contentRange = null;
|
|
44
|
-
let content = '';
|
|
45
|
-
|
|
46
|
-
if (block.carrierType === 'heading') {
|
|
47
|
-
// For headings, use the value range for the heading text
|
|
48
|
-
contentRange = block.valueRange;
|
|
49
|
-
content = text.substring(block.valueRange.start, block.valueRange.end);
|
|
50
|
-
} else if (block.carrierType === 'emphasis' || block.carrierType === 'blockquote' || block.carrierType === 'list' || block.carrierType === 'span') {
|
|
51
|
-
// For emphasis, blockquotes, lists, and spans, use the value range
|
|
52
|
-
if (block.valueRange) {
|
|
53
|
-
contentRange = block.valueRange;
|
|
54
|
-
content = text.substring(block.valueRange.start, block.valueRange.end);
|
|
55
|
-
} else {
|
|
56
|
-
// Fallback to block range
|
|
57
|
-
contentRange = block.range;
|
|
58
|
-
content = text.substring(block.range.start, block.range.end);
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
27
|
+
// Return the lean origin entry structure
|
|
62
28
|
return {
|
|
63
|
-
blockId:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
subject:
|
|
67
|
-
predicate:
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
blockRange: block.range,
|
|
72
|
-
carrierType: block.carrierType,
|
|
73
|
-
isVacant: slotInfo.isVacant || false
|
|
29
|
+
blockId: entry.blockId,
|
|
30
|
+
range: entry.range,
|
|
31
|
+
carrierType: entry.carrierType,
|
|
32
|
+
subject: entry.subject,
|
|
33
|
+
predicate: entry.predicate,
|
|
34
|
+
context: entry.context,
|
|
35
|
+
value: entry.value,
|
|
36
|
+
polarity: entry.polarity
|
|
74
37
|
};
|
|
75
38
|
}
|
package/src/merge.js
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { parse } from './parse.js';
|
|
2
|
+
import { DEFAULT_CONTEXT } from './utils.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Creates a unique key for quad identity matching
|
|
6
|
+
* @param {Quad} quad
|
|
7
|
+
* @returns {string}
|
|
8
|
+
*/
|
|
9
|
+
function quadKey(quad) {
|
|
10
|
+
const datatype = quad.object.datatype?.value || '';
|
|
11
|
+
const language = quad.object.language || '';
|
|
12
|
+
return `${quad.subject.value}|${quad.predicate.value}|${quad.object.value}|${datatype}|${language}`;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Normalizes merge input to ParseResult format
|
|
17
|
+
* @param {string|ParseResult} input
|
|
18
|
+
* @param {Object} options
|
|
19
|
+
* @param {Object} docContext
|
|
20
|
+
* @returns {ParseResult}
|
|
21
|
+
*/
|
|
22
|
+
function normalizeInput(input, options, docContext) {
|
|
23
|
+
if (typeof input === 'string') {
|
|
24
|
+
return parse(input, {
|
|
25
|
+
...options,
|
|
26
|
+
context: { ...docContext, ...options.context }
|
|
27
|
+
});
|
|
28
|
+
}
|
|
29
|
+
// ParseResult passthrough - no re-parse
|
|
30
|
+
return input;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Merges multiple MDLD documents with diff polarity resolution
|
|
35
|
+
* @param {Array<string|ParseResult>} docs
|
|
36
|
+
* @param {Object} options
|
|
37
|
+
* @returns {Object}
|
|
38
|
+
*/
|
|
39
|
+
export function merge(docs, options = {}) {
|
|
40
|
+
const sessionBuffer = new Map(); // Use Map instead of Set for proper quad storage
|
|
41
|
+
const sessionRemoveSet = new Set();
|
|
42
|
+
const allDocuments = [];
|
|
43
|
+
const quadIndex = new Map();
|
|
44
|
+
|
|
45
|
+
// Process each document in order
|
|
46
|
+
for (let i = 0; i < docs.length; i++) {
|
|
47
|
+
const input = docs[i];
|
|
48
|
+
|
|
49
|
+
// Each document gets the same context (no inheritance)
|
|
50
|
+
const docContext = { ...DEFAULT_CONTEXT, ...options.context };
|
|
51
|
+
|
|
52
|
+
// Normalize input to ParseResult
|
|
53
|
+
const doc = normalizeInput(input, options, docContext);
|
|
54
|
+
|
|
55
|
+
// Create document origin
|
|
56
|
+
const documentOrigin = {
|
|
57
|
+
index: i,
|
|
58
|
+
input: typeof input === 'string' ? 'string' : 'ParseResult',
|
|
59
|
+
origin: doc.origin,
|
|
60
|
+
context: doc.context
|
|
61
|
+
};
|
|
62
|
+
allDocuments.push(documentOrigin);
|
|
63
|
+
|
|
64
|
+
// Fold assertions into session buffer
|
|
65
|
+
for (const quad of doc.quads) {
|
|
66
|
+
const key = quadKey(quad);
|
|
67
|
+
sessionBuffer.set(key, quad);
|
|
68
|
+
|
|
69
|
+
// Create quad origin with document index and polarity
|
|
70
|
+
const existingOrigin = doc.origin.quadIndex.get(quadKey(quad));
|
|
71
|
+
if (existingOrigin) {
|
|
72
|
+
quadIndex.set(quadKey(quad), {
|
|
73
|
+
...existingOrigin,
|
|
74
|
+
documentIndex: i,
|
|
75
|
+
polarity: '+'
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Fold retractions
|
|
81
|
+
for (const quad of doc.remove) {
|
|
82
|
+
const key = quadKey(quad);
|
|
83
|
+
|
|
84
|
+
if (sessionBuffer.has(key)) {
|
|
85
|
+
// Inter-document cancel - remove from buffer
|
|
86
|
+
sessionBuffer.delete(key);
|
|
87
|
+
} else {
|
|
88
|
+
// External retract - add to remove set
|
|
89
|
+
sessionRemoveSet.add(quad);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// Create quad origin for remove quads
|
|
93
|
+
const existingOrigin = doc.origin.quadIndex.get(quadKey(quad));
|
|
94
|
+
if (existingOrigin) {
|
|
95
|
+
quadIndex.set(quadKey(quad), {
|
|
96
|
+
...existingOrigin,
|
|
97
|
+
documentIndex: i,
|
|
98
|
+
polarity: '-'
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Build final result
|
|
105
|
+
const finalQuads = Array.from(sessionBuffer.values());
|
|
106
|
+
const finalRemove = Array.from(sessionRemoveSet);
|
|
107
|
+
|
|
108
|
+
// Build merge origin
|
|
109
|
+
const mergeOrigin = {
|
|
110
|
+
documents: allDocuments,
|
|
111
|
+
quadIndex: quadIndex
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
// Build final context (union of all contexts)
|
|
115
|
+
const finalContext = { ...DEFAULT_CONTEXT, ...options.context };
|
|
116
|
+
|
|
117
|
+
// Enforce hard invariant
|
|
118
|
+
const quadKeys = new Set(finalQuads.map(quadKey));
|
|
119
|
+
const removeKeys = new Set(finalRemove.map(quadKey));
|
|
120
|
+
|
|
121
|
+
// Filter out any overlaps (shouldn't happen with correct implementation)
|
|
122
|
+
const filteredQuads = finalQuads.filter(quad => !removeKeys.has(quadKey(quad)));
|
|
123
|
+
const filteredRemove = finalRemove.filter(quad => !quadKeys.has(quadKey(quad)));
|
|
124
|
+
|
|
125
|
+
return {
|
|
126
|
+
quads: filteredQuads,
|
|
127
|
+
remove: filteredRemove,
|
|
128
|
+
origin: mergeOrigin,
|
|
129
|
+
context: finalContext
|
|
130
|
+
};
|
|
131
|
+
}
|
package/src/parse.js
CHANGED
|
@@ -4,7 +4,6 @@ import {
|
|
|
4
4
|
expandIRI,
|
|
5
5
|
parseSemanticBlock,
|
|
6
6
|
quadIndexKey,
|
|
7
|
-
createUnifiedSlot,
|
|
8
7
|
createLiteral,
|
|
9
8
|
hash
|
|
10
9
|
} from './utils.js';
|
|
@@ -343,7 +342,7 @@ function determineCarrierType(url) {
|
|
|
343
342
|
return { carrierType: 'span', resourceIRI: null };
|
|
344
343
|
}
|
|
345
344
|
|
|
346
|
-
function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx) {
|
|
345
|
+
function createBlock(subject, types, predicates, range, attrsRange, valueRange, carrierType, ctx, text) {
|
|
347
346
|
const expanded = {
|
|
348
347
|
subject,
|
|
349
348
|
types: types.map(t => expandIRI(typeof t === 'string' ? t : t.iri, ctx)),
|
|
@@ -356,30 +355,121 @@ function createBlock(subject, types, predicates, range, attrsRange, valueRange,
|
|
|
356
355
|
return {
|
|
357
356
|
id: blockId,
|
|
358
357
|
range: { start: range[0], end: range[1] },
|
|
359
|
-
attrsRange: attrsRange ? { start: attrsRange[0], end: attrsRange[1] } : null,
|
|
360
|
-
valueRange: valueRange ? { start: valueRange[0], end: valueRange[1] } : null,
|
|
361
358
|
carrierType: carrierType || null,
|
|
362
359
|
subject,
|
|
363
360
|
types: expanded.types,
|
|
364
361
|
predicates: expanded.predicates,
|
|
365
|
-
context: ctx
|
|
362
|
+
context: ctx,
|
|
363
|
+
text: text || ''
|
|
366
364
|
};
|
|
367
365
|
}
|
|
368
366
|
|
|
369
|
-
function emitQuad(quads,
|
|
367
|
+
function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predicate, object, dataFactory, meta = null, statements = null, statementCandidates = null) {
|
|
370
368
|
if (!subject || !predicate || !object) return;
|
|
371
369
|
|
|
372
370
|
const quad = dataFactory.quad(subject, predicate, object);
|
|
373
|
-
|
|
371
|
+
const remove = meta?.remove || false;
|
|
372
|
+
|
|
373
|
+
if (remove) {
|
|
374
|
+
// Check if quad exists in current buffer
|
|
375
|
+
const quadKey = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
376
|
+
if (quadBuffer.has(quadKey)) {
|
|
377
|
+
// In current state → cancel, appears nowhere
|
|
378
|
+
quadBuffer.delete(quadKey);
|
|
379
|
+
// Also remove from quads array if present
|
|
380
|
+
const index = quads.findIndex(q =>
|
|
381
|
+
q.subject.value === quad.subject.value &&
|
|
382
|
+
q.predicate.value === quad.predicate.value &&
|
|
383
|
+
q.object.value === quad.object.value
|
|
384
|
+
);
|
|
385
|
+
if (index !== -1) {
|
|
386
|
+
quads.splice(index, 1);
|
|
387
|
+
}
|
|
388
|
+
// Remove from quadIndex
|
|
389
|
+
quadIndex.delete(quadKey);
|
|
390
|
+
} else {
|
|
391
|
+
// Not in current state → external retract
|
|
392
|
+
removeSet.add(quad);
|
|
393
|
+
}
|
|
394
|
+
} else {
|
|
395
|
+
// Add to buffer and quads
|
|
396
|
+
const quadKey = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
397
|
+
quadBuffer.set(quadKey, quad);
|
|
398
|
+
quads.push(quad);
|
|
399
|
+
|
|
400
|
+
// Detect rdf:Statement pattern during single-pass parsing
|
|
401
|
+
detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
|
|
402
|
+
|
|
403
|
+
// Create lean origin entry
|
|
404
|
+
const originEntry = {
|
|
405
|
+
blockId: block.id,
|
|
406
|
+
range: block.range,
|
|
407
|
+
carrierType: block.carrierType,
|
|
408
|
+
subject: subject.value,
|
|
409
|
+
predicate: predicate.value,
|
|
410
|
+
context: { ...block.context },
|
|
411
|
+
polarity: meta?.remove ? '-' : '+',
|
|
412
|
+
value: block.text || ''
|
|
413
|
+
};
|
|
414
|
+
|
|
415
|
+
quadIndex.set(quadKey, originEntry);
|
|
416
|
+
}
|
|
417
|
+
}
|
|
374
418
|
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
419
|
+
// Extract RDF constants once at module level for efficiency
|
|
420
|
+
const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
|
|
421
|
+
const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
|
|
422
|
+
const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
|
|
423
|
+
const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
|
|
424
|
+
const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
|
|
425
|
+
|
|
426
|
+
function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
|
|
427
|
+
// Skip if not called from parse context (for testing compatibility)
|
|
428
|
+
if (!statements || !statementCandidates) return;
|
|
381
429
|
|
|
382
|
-
|
|
430
|
+
const predicate = quad.predicate.value;
|
|
431
|
+
|
|
432
|
+
// Early filter: only process rdf:Statement related predicates
|
|
433
|
+
if (predicate !== RDF_TYPE &&
|
|
434
|
+
predicate !== RDF_SUBJECT &&
|
|
435
|
+
predicate !== RDF_PREDICATE &&
|
|
436
|
+
predicate !== RDF_OBJECT) {
|
|
437
|
+
return;
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
// Check if this quad starts a new rdf:Statement pattern
|
|
441
|
+
if (predicate === RDF_TYPE && quad.object.value === RDF_STATEMENT) {
|
|
442
|
+
statementCandidates.set(quad.subject.value, { spo: {} });
|
|
443
|
+
return;
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// Check if this quad completes part of an existing rdf:Statement pattern
|
|
447
|
+
const candidate = statementCandidates.get(quad.subject.value);
|
|
448
|
+
if (!candidate) return;
|
|
449
|
+
|
|
450
|
+
// Direct property assignment instead of switch for better performance
|
|
451
|
+
if (predicate === RDF_SUBJECT) {
|
|
452
|
+
candidate.spo.subject = quad.object;
|
|
453
|
+
} else if (predicate === RDF_PREDICATE) {
|
|
454
|
+
candidate.spo.predicate = quad.object;
|
|
455
|
+
} else if (predicate === RDF_OBJECT) {
|
|
456
|
+
candidate.spo.object = quad.object;
|
|
457
|
+
// Store the original quad for potential literal extraction
|
|
458
|
+
candidate.objectQuad = quad;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// Check if pattern is complete and create elevated SPO quad
|
|
462
|
+
if (candidate.spo.subject && candidate.spo.predicate && candidate.spo.object) {
|
|
463
|
+
// Use the object directly - literal detection happens at parse time
|
|
464
|
+
const spoQuad = dataFactory.quad(
|
|
465
|
+
candidate.spo.subject,
|
|
466
|
+
candidate.spo.predicate,
|
|
467
|
+
candidate.spo.object
|
|
468
|
+
);
|
|
469
|
+
statements.push(spoQuad);
|
|
470
|
+
// Clean up candidate to avoid duplicate detection
|
|
471
|
+
statementCandidates.delete(quad.subject.value);
|
|
472
|
+
}
|
|
383
473
|
}
|
|
384
474
|
|
|
385
475
|
const resolveFragment = (fragment, state) => {
|
|
@@ -406,23 +496,25 @@ function resolveObject(sem, state) {
|
|
|
406
496
|
|
|
407
497
|
const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
|
|
408
498
|
const expandedType = expandIRI(typeIRI, state.ctx);
|
|
499
|
+
const typeInfo = typeof entryIndex === 'object' ? entryIndex : { entryIndex, remove: false };
|
|
409
500
|
emitQuad(
|
|
410
|
-
state.quads, state.origin.
|
|
501
|
+
state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
|
|
411
502
|
subject,
|
|
412
503
|
state.df.namedNode(expandIRI('rdf:type', state.ctx)),
|
|
413
504
|
state.df.namedNode(expandedType),
|
|
414
505
|
state.df,
|
|
415
|
-
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex }
|
|
506
|
+
{ kind: 'type', token: `.${typeIRI}`, expandedType, entryIndex: typeInfo.entryIndex, remove: typeInfo.remove },
|
|
507
|
+
state.statements, state.statementCandidates
|
|
416
508
|
);
|
|
417
509
|
};
|
|
418
510
|
|
|
419
511
|
function processTypeAnnotations(sem, newSubject, localObject, carrierO, S, block, state, carrier) {
|
|
420
512
|
sem.types.forEach(t => {
|
|
421
513
|
const typeIRI = typeof t === 'string' ? t : t.iri;
|
|
422
|
-
const
|
|
514
|
+
const typeInfo = typeof t === 'string' ? { entryIndex: null, remove: false } : t;
|
|
423
515
|
// Type subject priority: explicit subject > soft object > carrier URL > current subject
|
|
424
516
|
let typeSubject = newSubject || localObject || carrierO || S;
|
|
425
|
-
createTypeQuad(typeIRI, typeSubject, state, block,
|
|
517
|
+
createTypeQuad(typeIRI, typeSubject, state, block, typeInfo);
|
|
426
518
|
});
|
|
427
519
|
}
|
|
428
520
|
|
|
@@ -453,9 +545,10 @@ function processPredicateAnnotations(sem, newSubject, previousSubject, localObje
|
|
|
453
545
|
const role = determinePredicateRole(pred, carrier, newSubject, previousSubject, localObject, newSubjectOrCarrierO, S, L);
|
|
454
546
|
if (role) {
|
|
455
547
|
const P = state.df.namedNode(expandIRI(pred.iri, state.ctx));
|
|
456
|
-
emitQuad(state.quads, state.origin.
|
|
548
|
+
emitQuad(state.quads, state.quadBuffer, state.removeSet, state.origin.quadIndex, block,
|
|
457
549
|
role.subject, P, role.object, state.df,
|
|
458
|
-
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex }
|
|
550
|
+
{ kind: 'pred', token: `${pred.form}${pred.iri}`, form: pred.form, expandedPredicate: P.value, entryIndex: pred.entryIndex, remove: pred.remove || false },
|
|
551
|
+
state.statements, state.statementCandidates
|
|
459
552
|
);
|
|
460
553
|
}
|
|
461
554
|
});
|
|
@@ -483,7 +576,7 @@ function processAnnotation(carrier, sem, state, options = {}) {
|
|
|
483
576
|
const block = createBlock(
|
|
484
577
|
S.value, sem.types, sem.predicates,
|
|
485
578
|
carrier.range, carrier.attrsRange || null, carrier.valueRange || null,
|
|
486
|
-
carrier.type || null, state.ctx
|
|
579
|
+
carrier.type || null, state.ctx, carrier.text
|
|
487
580
|
);
|
|
488
581
|
|
|
489
582
|
const L = createLiteral(carrier.text, sem.datatype, sem.language, state.ctx, state.df);
|
|
@@ -555,10 +648,14 @@ export function parse(text, options = {}) {
|
|
|
555
648
|
ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
|
|
556
649
|
df: options.dataFactory || DataFactory,
|
|
557
650
|
quads: [],
|
|
558
|
-
|
|
651
|
+
quadBuffer: new Map(),
|
|
652
|
+
removeSet: new Set(),
|
|
653
|
+
origin: { quadIndex: new Map() },
|
|
559
654
|
currentSubject: null,
|
|
560
655
|
tokens: null,
|
|
561
|
-
currentTokenIndex: -1
|
|
656
|
+
currentTokenIndex: -1,
|
|
657
|
+
statements: [],
|
|
658
|
+
statementCandidates: new Map() // Track incomplete rdf:Statement patterns
|
|
562
659
|
};
|
|
563
660
|
|
|
564
661
|
state.tokens = scanTokens(text);
|
|
@@ -582,5 +679,18 @@ export function parse(text, options = {}) {
|
|
|
582
679
|
TOKEN_PROCESSORS[token.type]?.(token, state);
|
|
583
680
|
}
|
|
584
681
|
|
|
585
|
-
|
|
682
|
+
// Convert removeSet to array and ensure hard invariant: quads ∩ remove = ∅
|
|
683
|
+
const removeArray = Array.from(state.removeSet);
|
|
684
|
+
const quadKeys = new Set();
|
|
685
|
+
state.quads.forEach(q => {
|
|
686
|
+
quadKeys.add(quadIndexKey(q.subject, q.predicate, q.object));
|
|
687
|
+
});
|
|
688
|
+
|
|
689
|
+
// Filter removeArray to ensure no overlap with quads
|
|
690
|
+
const filteredRemove = removeArray.filter(quad => {
|
|
691
|
+
const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
|
|
692
|
+
return !quadKeys.has(key);
|
|
693
|
+
});
|
|
694
|
+
|
|
695
|
+
return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
|
|
586
696
|
}
|