mdld-parse 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,531 @@
1
+ import { parse } from './parse.js';
2
+ import {
3
+ shortenIRI,
4
+ normalizeQuad,
5
+ quadToKeyForOrigin,
6
+ parseQuadIndexKey,
7
+ findVacantSlot,
8
+ occupySlot,
9
+ markSlotAsVacant,
10
+ normalizeAttrsTokens,
11
+ writeAttrsTokens,
12
+ removeOneToken,
13
+ addObjectToken,
14
+ removeObjectToken,
15
+ addSoftFragmentToken,
16
+ removeSoftFragmentToken,
17
+ objectSignature,
18
+ expandIRI
19
+ } from './utils.js';
20
+
21
/**
 * Fetches a block from the origin's `blocks` collection.
 *
 * @param {object|null|undefined} base - Origin object exposing a `blocks` collection with a `get` method.
 * @param {*} blockId - Identifier of the wanted block.
 * @returns {*} `null` when `blockId` is falsy; otherwise whatever `base.blocks.get(blockId)`
 *   yields (`undefined` when the origin, its collection, or the block is missing).
 */
function getBlockById(base, blockId) {
    if (!blockId) {
        return null;
    }
    return base?.blocks?.get(blockId);
}
24
+
25
/**
 * Fetches the index entry stored under a quad key.
 *
 * @param {object|null|undefined} base - Origin object exposing a `quadIndex` collection with a `get` method.
 * @param {*} quadKey - Key of the wanted entry.
 * @returns {*} `null` when `quadKey` is falsy; otherwise whatever `base.quadIndex.get(quadKey)`
 *   yields (`undefined` when the origin, its index, or the entry is missing).
 */
function getEntryByQuadKey(base, quadKey) {
    if (!quadKey) {
        return null;
    }
    return base?.quadIndex?.get(quadKey);
}
28
+
29
/**
 * Tells whether a value carries the three terms this module needs from a quad.
 *
 * Always returns a strict boolean so the predicate is safe to use outside of
 * `Array.prototype.filter` as well (the `&&` chain alone would leak `null`,
 * `undefined` or the object term itself).
 *
 * @param {*} quad - Candidate quad.
 * @returns {boolean} `true` when `subject`, `predicate` and `object` are all truthy.
 */
function isValidQuad(quad) {
    return Boolean(quad?.subject && quad.predicate && quad.object);
}
32
+
33
/**
 * Builds the annotation body for a literal: the predicate, optionally followed by
 * a language tag (`@lang`) or a shortened datatype (`^^type`).
 *
 * A truthy language wins over the datatype; the XSD string datatype is never written.
 *
 * @param {*} value - Literal value (accepted for signature compatibility; not read here).
 * @param {string} predicate - Predicate token to start the annotation with.
 * @param {string|null|undefined} language - Language tag of the literal, if any.
 * @param {{ value?: string }|null|undefined} datatype - Datatype term of the literal, if any.
 * @param {object} ctx - Context handed to `shortenIRI` when the datatype is shortened.
 * @returns {string} The annotation body.
 */
function createLiteralAnnotation(value, predicate, language, datatype, ctx) {
    if (language) {
        return `${predicate} @${language}`;
    }
    const datatypeIRI = datatype?.value;
    const isPlainString = datatypeIRI === 'http://www.w3.org/2001/XMLSchema#string';
    if (datatypeIRI && !isPlainString) {
        return `${predicate} ^^${shortenIRI(datatypeIRI, ctx)}`;
    }
    return predicate;
}
41
+
42
/**
 * Builds a bracketed carrier plus annotation for an object reference.
 *
 * @param {string} objectShort - Object identifier, used as the carrier text and, for
 *   non-fragment references, as the `+` target.
 * @param {string} predicateShort - Predicate written after `?`.
 * @param {boolean} [isSoftFragment=false] - When truthy the `+` target is `#fragment` instead.
 * @param {string|null} [fragment=null] - Fragment name used when `isSoftFragment` is truthy.
 * @returns {string} Text of the form `[object] {+target ?predicate}`.
 */
function createObjectAnnotation(objectShort, predicateShort, isSoftFragment = false, fragment = null) {
    const target = isSoftFragment ? `#${fragment}` : objectShort;
    return `[${objectShort}] {+${target} ?${predicateShort}}`;
}
48
+
49
/**
 * Reads one of a block's ranges out of the document text.
 *
 * @param {object|null|undefined} block - Block exposing `attrsRange` and/or `valueRange`.
 * @param {string} text - Document text the ranges index into.
 * @param {string} [spanType='attrs'] - `'attrs'` selects `attrsRange`; any other value selects `valueRange`.
 * @returns {{ start: number, end: number, text: string }|null} The range with its text, or
 *   `null` when the range is absent or is not a finite, non-negative, ordered pair.
 */
function readSpan(block, text, spanType = 'attrs') {
    const range = spanType === 'attrs' ? block?.attrsRange : block?.valueRange;
    if (!range) {
        return null;
    }

    const { start, end } = range;
    const isUsable = Number.isFinite(start) && Number.isFinite(end) && start >= 0 && end >= start;
    if (!isUsable) {
        return null;
    }

    return { start, end, text: text.substring(start, end) };
}
57
+
58
/**
 * Makes a raw value safe to write into a block's carrier, depending on the carrier type.
 *
 * - `'code'`: only line endings are normalised to `\n`.
 * - `'span'` / `'link'`: collapsed to one trimmed line, then square brackets become spaces.
 * - anything else: collapsed to one trimmed line.
 *
 * @param {{ carrierType?: string }|null|undefined} block - Block whose `carrierType` picks the rule.
 * @param {*} raw - Value to sanitise; `null`/`undefined` are treated as the empty string.
 * @returns {string} The sanitised text.
 */
function sanitizeCarrierValueForBlock(block, raw) {
    const source = String(raw ?? '');
    const carrierType = block?.carrierType;

    if (carrierType === 'code') {
        return source.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    }

    const singleLine = source.replace(/[\n\r]+/g, ' ').trim();
    switch (carrierType) {
        case 'span':
        case 'link':
            // Brackets would terminate the bracketed carrier early.
            return singleLine.replace(/[\[\]]/g, ' ');
        default:
            return singleLine;
    }
}
65
+
66
/**
 * Collects the truthy `raw` tokens of a block's entries, in entry order.
 *
 * @param {{ entries?: Array<{ raw?: * }> }|null|undefined} block - Block to read.
 * @returns {Array|null} The tokens, or `null` when the block has no entries.
 */
function blockTokensFromEntries(block) {
    const entries = block?.entries;
    if (!entries?.length) {
        return null;
    }
    return entries.flatMap((entry) => (entry.raw ? [entry.raw] : []));
}
69
+
70
/**
 * Returns a copy of a block's entries with the entry at `entryIndex` left out.
 * The block itself is not modified.
 *
 * @param {{ entries?: Array }|null|undefined} block - Block owning the entries.
 * @param {number|null|undefined} entryIndex - Position of the entry to drop.
 * @returns {Array|null} The shortened copy, or `null` when the block has no entries
 *   or the index is nullish or out of bounds.
 */
function removeEntryAt(block, entryIndex) {
    const entries = block?.entries;
    if (!entries) {
        return null;
    }
    if (entryIndex == null) {
        return null;
    }
    const isOutOfBounds = entryIndex < 0 || entryIndex >= entries.length;
    if (isOutOfBounds) {
        return null;
    }

    const before = entries.slice(0, entryIndex);
    const after = entries.slice(entryIndex + 1);
    return [...before, ...after];
}
74
+
75
/**
 * Produces a block's entry list with its language/datatype entries swapped for the
 * ones that describe `lit`.
 *
 * Existing entries of kind `language` or `datatype` are dropped. At most one entry
 * is then appended: a language entry when the literal has a language, otherwise a
 * datatype entry when the datatype is set and is not XSD string. Appended entries
 * carry a zeroed `relRange`.
 *
 * @param {{ entries?: Array<object> }|null|undefined} block - Block owning the entries (not modified).
 * @param {{ language?: string, datatype?: { value?: string } }|null|undefined} lit - New literal term.
 * @param {object} ctx - Context handed to `shortenIRI` for the datatype.
 * @returns {Array<object>|null} New entry list, or `null` when the block has no `entries`.
 */
function replaceLangDatatypeEntries(block, lit, ctx) {
    const entries = block?.entries;
    if (!entries) {
        return null;
    }

    const kept = entries.filter(({ kind }) => !(kind === 'language' || kind === 'datatype'));

    const language = lit?.language;
    if (language) {
        kept.push({ kind: 'language', language, raw: `@${language}`, relRange: { start: 0, end: 0 } });
        return kept;
    }

    const datatypeIRI = lit?.datatype?.value;
    if (datatypeIRI && datatypeIRI !== 'http://www.w3.org/2001/XMLSchema#string') {
        const shortType = shortenIRI(datatypeIRI, ctx);
        kept.push({ kind: 'datatype', datatype: shortType, raw: `^^${shortType}`, relRange: { start: 0, end: 0 } });
    }
    return kept;
}
86
+
87
/**
 * Rewrites the language/datatype part of an attribute token list for a new literal.
 *
 * Tokens starting with `@` or `^^` are dropped; then `@lang` is appended when the
 * literal has a language, otherwise `^^type` when its datatype is set and is not
 * XSD string.
 *
 * @param {string[]} tokens - Current attribute tokens (not modified).
 * @param {{ language?: string, datatype?: { value?: string } }|null|undefined} newLit - New literal term.
 * @param {object} ctx - Context handed to `shortenIRI` for the datatype.
 * @returns {string[]} A new token list.
 */
function updateAttrsDatatypeLang(tokens, newLit, ctx) {
    const kept = tokens.filter((token) => !(token.startsWith('@') || token.startsWith('^^')));

    const language = newLit?.language;
    if (language) {
        kept.push(`@${language}`);
        return kept;
    }

    const datatypeIRI = newLit?.datatype?.value;
    if (datatypeIRI && datatypeIRI !== 'http://www.w3.org/2001/XMLSchema#string') {
        kept.push(`^^${shortenIRI(datatypeIRI, ctx)}`);
    }
    return kept;
}
96
+
97
/**
 * Bundles a block with one of its index entries so token edits can be expressed
 * per entry kind.
 *
 * Instance fields mirror the entry: `kind`, `entryIndex`, `isVacant` (defaults to
 * `false`) and `form`. An explicit `kind` argument takes precedence over the entry's.
 */
class Slot {
    /**
     * @param {object|null|undefined} block - Block the slot belongs to.
     * @param {object|null|undefined} entry - Index entry backing the slot; may be absent.
     * @param {string|null} [kind=null] - Kind override; falls back to `entry.kind`.
     */
    constructor(block, entry, kind = null) {
        const { kind: entryKind, entryIndex, isVacant, form } = entry ?? {};
        this.block = block;
        this.entry = entry;
        this.kind = kind || entryKind;
        this.entryIndex = entryIndex;
        this.isVacant = isVacant || false;
        this.form = form;
    }

    /**
     * Removes the token that represents `quad` from a token list.
     *
     * Dispatch by slot kind: `object` and `softFragment` delegate to their dedicated
     * helpers; a `type` slot combined with an rdf:type predicate removes the matching
     * `.Type` token; everything else removes a predicate token whose expanded IRI
     * (and form prefix, when the entry records one) matches.
     *
     * @param {string[]} tokens - Current attribute tokens.
     * @param {object} ctx - Context used to shorten/expand IRIs.
     * @param {object} quad - Quad being removed.
     * @returns {{ tokens: string[], removed: boolean }} Result of the removal helper, or
     *   the untouched tokens with `removed: false` when the slot has no entry.
     */
    removeToken(tokens, ctx, quad) {
        const { entry, kind } = this;
        if (!entry) {
            return { tokens, removed: false };
        }

        switch (kind) {
            case 'object':
                return removeObjectToken(tokens, shortenIRI(quad.object.value, ctx));
            case 'softFragment':
                return removeSoftFragmentToken(tokens, entry.fragment);
            default:
                break;
        }

        if (kind === 'type' && quad.predicate.value.endsWith('rdf-syntax-ns#type')) {
            const wantedType = entry.expandedType || quad.object.value;
            return removeOneToken(
                tokens,
                (token) => token.startsWith('.') && expandIRI(token.slice(1), ctx) === wantedType
            );
        }

        const wantedPredicate = entry.expandedPredicate || quad.predicate.value;
        const wantedForm = entry.form;
        return removeOneToken(tokens, (token) => {
            // Split an optional form prefix (`^?`, `^`, `?`) from the predicate name.
            const match = String(token).match(/^(\^\?|\^|\?|)(.+)$/);
            if (!match) {
                return false;
            }
            const [, form = '', rawPredicate] = match;
            if (wantedForm != null && form !== wantedForm) {
                return false;
            }
            return expandIRI(rawPredicate, ctx) === wantedPredicate;
        });
    }

    /**
     * Adds the token that represents `quad` to a token list.
     *
     * - rdf:type with a NamedNode object: appends `.Type` unless already present.
     * - other NamedNode objects: delegates to the soft-fragment helper when the IRI
     *   contains `#`, otherwise to the object-token helper.
     * - Literal objects: appends the shortened predicate unless already present.
     *
     * @param {string[]} tokens - Current attribute tokens (not modified here).
     * @param {object} ctx - Context used to shorten IRIs.
     * @param {object} quad - Quad being added.
     * @returns {string[]} The new token list, or `tokens` itself when nothing applies.
     */
    addToken(tokens, ctx, quad) {
        const isTypePredicate = quad.predicate.value.endsWith('rdf-syntax-ns#type');
        if (isTypePredicate && quad.object?.termType === 'NamedNode') {
            const typeShort = shortenIRI(quad.object.value, ctx);
            const typeToken = typeShort.includes(':') || !typeShort.startsWith('http') ? `.${typeShort}` : null;
            const shouldAppend = typeToken && !tokens.includes(typeToken);
            return shouldAppend ? [...tokens, typeToken] : tokens;
        }

        const { termType, value } = quad.object;

        if (termType === 'NamedNode') {
            if (value.includes('#')) {
                return addSoftFragmentToken(tokens, value.split('#')[1]);
            }
            return addObjectToken(tokens, shortenIRI(value, ctx));
        }

        if (termType === 'Literal') {
            // A literal only needs its predicate named in the annotation.
            const predicateShort = shortenIRI(quad.predicate.value, ctx);
            return tokens.includes(predicateShort) ? tokens : [...tokens, predicateShort];
        }

        return tokens;
    }

    /**
     * Marks the backing entry as vacant via `markSlotAsVacant`.
     *
     * @param {object} quad - Quad whose object is passed along to the helper.
     * @returns {*} The helper's result, or `null` when the entry is absent or has no `slotId`.
     */
    markVacant(quad) {
        return this.entry?.slotId ? markSlotAsVacant(this.entry, quad.object) : null;
    }
}
172
+
173
/**
 * Applies an RDF diff to annotated text and returns the updated text with a fresh origin.
 *
 * With no effective diff (missing, or both `add` and `delete` empty) the text is
 * returned unchanged together with the origin of a re-parse. Otherwise the work runs
 * in three phases: plan the operations, turn them into text edits, then apply the
 * edits and re-parse.
 *
 * @param {object} args
 * @param {string} args.text - Source text.
 * @param {{ add?: Array, delete?: Array }} [args.diff] - Quads to add and delete.
 * @param {object} [args.origin] - Origin of a previous parse of `text`; parsed on demand when omitted.
 * @param {{ context?: object }} [args.options={}] - `context` is forwarded to `parse` and the phases.
 * @returns {{ text: string, origin: object }} Resulting text and its origin.
 */
export function serialize({ text, diff, origin, options = {} }) {
    // Evaluated per use, as before, so a missing context yields a fresh object each time.
    const contextOf = () => options.context || {};

    const addCount = diff?.add?.length;
    const deleteCount = diff?.delete?.length;
    if (!addCount && !deleteCount) {
        const { origin: reparsedOrigin } = parse(text, { context: contextOf() });
        return { text, origin: reparsedOrigin };
    }

    const base = origin || parse(text, { context: contextOf() }).origin;
    const ctx = contextOf();

    // Phase 1: decide what has to happen (no text is touched).
    const plan = planOperations(diff, base, ctx);
    // Phase 2: express the plan as range replacements.
    const edits = materializeEdits(plan, text, ctx, base);
    // Phase 3: apply the replacements and re-parse.
    return applyEdits(text, edits, ctx, base);
}
191
+
192
/**
 * Phase 1 of serialization: sorts the diff into categories without touching any text.
 *
 * Steps, in order:
 * 1. Normalise both quad lists and drop invalid quads.
 * 2. Remember, per deleted quad, the block it lives in (only blocks with an
 *    `attrsRange`), keyed by subject + object signature, as anchors for later adds.
 * 3. Pair each deleted literal with a not-yet-consumed added literal of the same
 *    subject and predicate: a literal update.
 * 4. Let remaining added literals take over a vacant slot when one exists.
 * 5. Everything still deleted becomes a plain delete (if its block can be found).
 * 6. Everything still added becomes a plain add with an optional target block.
 *
 * @param {{ add?: Array, delete?: Array }} diff - Quads to add and delete.
 * @param {object} base - Origin providing `blocks` and `quadIndex`.
 * @param {object} ctx - Context (accepted for signature compatibility; not read here).
 * @returns {{ literalUpdates: Array, vacantSlotOccupations: Array, deletes: Array, adds: Array, consumedAdds: Set }}
 *   The plan; `consumedAdds` holds the origin keys of adds handled in steps 3 and 4.
 */
function planOperations(diff, base, ctx) {
    const toValidQuads = (quads) => (quads || []).map(normalizeQuad).filter(isValidQuad);
    const subjectPredicateKey = (quad) => JSON.stringify([quad.subject.value, quad.predicate.value]);
    // Index entries may be stored either as objects with a `blockId` or as the id itself.
    const blockOfEntry = (entry) => (entry ? getBlockById(base, entry.blockId || entry) : null);

    const normAdds = toValidQuads(diff.add);
    const normDeletes = toValidQuads(diff.delete);

    const literalUpdates = [];
    const vacantSlotOccupations = [];
    const deletes = [];
    const adds = [];
    const consumedAdds = new Set();

    // Added quads grouped by subject + predicate.
    const addsBySubjectPredicate = new Map();
    for (const added of normAdds) {
        const groupKey = subjectPredicateKey(added);
        if (!addsBySubjectPredicate.has(groupKey)) {
            addsBySubjectPredicate.set(groupKey, []);
        }
        addsBySubjectPredicate.get(groupKey).push(added);
    }

    // Blocks of deleted quads, usable as anchors for adds with the same subject/object.
    const anchors = new Map();
    for (const deleted of normDeletes) {
        const anchorKey = JSON.stringify([deleted.subject.value, objectSignature(deleted.object)]);
        const entry = getEntryByQuadKey(base, quadToKeyForOrigin(deleted));
        const block = getBlockById(base, entry?.blockId || entry);
        if (block?.attrsRange) {
            anchors.set(anchorKey, { block, entry });
        }
    }

    // Literal delete + literal add on the same subject/predicate = in-place update.
    for (const deleteQuad of normDeletes) {
        if (deleteQuad.object.termType !== 'Literal') {
            continue;
        }
        const candidates = addsBySubjectPredicate.get(subjectPredicateKey(deleteQuad)) || [];
        const addQuad = candidates.find(
            (candidate) => candidate?.object?.termType === 'Literal' && !consumedAdds.has(quadToKeyForOrigin(candidate))
        );
        if (!addQuad) {
            continue;
        }

        const entry = resolveOriginEntry(deleteQuad, base);
        const block = blockOfEntry(entry);
        if (!block) {
            continue;
        }
        literalUpdates.push({ deleteQuad, addQuad, entry, block });
        consumedAdds.add(quadToKeyForOrigin(addQuad));
    }

    // Remaining literal adds may reuse a vacant slot.
    for (const quad of normAdds) {
        if (quad.object.termType !== 'Literal' || consumedAdds.has(quadToKeyForOrigin(quad))) {
            continue;
        }
        const vacantSlot = findVacantSlot(base?.quadIndex, quad.subject, quad.predicate);
        if (!vacantSlot) {
            continue;
        }
        const block = base?.blocks?.get(vacantSlot.blockId);
        if (!block) {
            continue;
        }
        vacantSlotOccupations.push({ quad, vacantSlot, block });
        consumedAdds.add(quadToKeyForOrigin(quad));
    }

    // Deletes not already covered by a literal update.
    const isCoveredByUpdate = (quad) =>
        literalUpdates.some(
            ({ deleteQuad }) =>
                deleteQuad.subject.value === quad.subject.value &&
                deleteQuad.predicate.value === quad.predicate.value &&
                deleteQuad.object.value === quad.object.value
        );
    for (const quad of normDeletes) {
        if (quad.object.termType === 'Literal' && isCoveredByUpdate(quad)) {
            continue;
        }
        const entry = resolveOriginEntry(quad, base);
        const block = blockOfEntry(entry);
        if (block) {
            deletes.push({ quad, entry, block });
        }
    }

    // Adds not consumed by an update or a vacant slot.
    for (const quad of normAdds) {
        if (consumedAdds.has(quadToKeyForOrigin(quad))) {
            continue;
        }
        adds.push({ quad, targetBlock: findTargetBlock(quad, base, anchors) });
    }

    return { literalUpdates, vacantSlotOccupations, deletes, adds, consumedAdds };
}
291
+
292
/**
 * Builds the edit that appends a brand-new annotated carrier at the end of the text.
 *
 * Both literal and object annotations start on their own line so they are never
 * glued to whatever currently ends the document.
 *
 * @param {object} quad - Quad to express; its object is a Literal or NamedNode.
 * @param {string} text - Current document text (only its length is used).
 * @param {object} ctx - Context handed to `shortenIRI`.
 * @returns {{ start: number, end: number, text: string }} Insertion edit at end of text.
 */
function buildAppendEdit(quad, text, ctx) {
    const predShort = shortenIRI(quad.predicate.value, ctx);
    let appended;
    if (quad.object.termType === 'Literal') {
        // NOTE(review): the value is written verbatim; a value containing `]` or a
        // line break may not survive a re-parse — confirm whether it should be sanitised.
        const value = String(quad.object.value ?? '');
        const ann = createLiteralAnnotation(value, predShort, quad.object.language, quad.object.datatype, ctx);
        appended = `\n[${value}] {${ann}}`;
    } else {
        const objectShort = shortenIRI(quad.object.value, ctx);
        appended = `\n${createObjectAnnotation(objectShort, predShort)}`;
    }
    return { start: text.length, end: text.length, text: appended };
}

/**
 * Phase 2 of serialization: converts a plan into range replacements on `text`.
 *
 * Edits are produced in plan order (vacant-slot occupations, literal updates,
 * deletes, adds); all ranges refer to the ORIGINAL text. Besides returning edits,
 * the delete step records vacated slots in `base.quadIndex`.
 *
 * @param {{ vacantSlotOccupations: Array, literalUpdates: Array, deletes: Array, adds: Array }} plan
 *   Plan produced by `planOperations`.
 * @param {string} text - Original document text.
 * @param {object} ctx - Context used to shorten/expand IRIs.
 * @param {object} base - Origin; its `quadIndex` is updated when slots are vacated.
 * @returns {Array<{ start: number, end: number, text: string }>} Replacements to apply.
 */
function materializeEdits(plan, text, ctx, base) {
    const edits = [];
    const attrsTextFor = (tokens) => (tokens.length === 0 ? '{}' : writeAttrsTokens(tokens));

    // Vacant slot occupations: write the new value and make sure the predicate is annotated.
    for (const { quad, vacantSlot, block } of plan.vacantSlotOccupations) {
        const span = readSpan(block, text, 'attrs');
        if (!span) continue;

        const valueSpan = readSpan(block, text, 'value');
        if (valueSpan) {
            // Sanitise exactly like literal updates do, so line breaks or brackets in
            // the value cannot break the carrier syntax.
            const newValue = sanitizeCarrierValueForBlock(block, quad.object.value);
            edits.push({ start: valueSpan.start, end: valueSpan.end, text: newValue });
        }

        const tokens = normalizeAttrsTokens(span.text);
        const predToken = `${vacantSlot.form || ''}${shortenIRI(quad.predicate.value, ctx)}`;

        if (tokens.length === 0) {
            edits.push({ start: span.start, end: span.end, text: `{${predToken}}` });
        } else if (!tokens.includes(predToken)) {
            edits.push({ start: span.start, end: span.end, text: writeAttrsTokens([...tokens, predToken]) });
        }
    }

    // Literal updates: replace the carrier value and refresh the language/datatype tokens.
    for (const { addQuad, block } of plan.literalUpdates) {
        const valueSpan = readSpan(block, text, 'value');
        if (valueSpan) {
            const newValue = sanitizeCarrierValueForBlock(block, addQuad.object.value);
            edits.push({ start: valueSpan.start, end: valueSpan.end, text: newValue });
        }

        const attrsSpan = readSpan(block, text, 'attrs');
        if (!attrsSpan) continue;

        if (block?.entries?.length) {
            // Entry-based rewrite: always emitted when the block has parsed entries.
            const nextEntries = replaceLangDatatypeEntries(block, addQuad.object, ctx);
            if (nextEntries) {
                const nextTokens = nextEntries.map((e) => e.raw).filter(Boolean);
                edits.push({ start: attrsSpan.start, end: attrsSpan.end, text: attrsTextFor(nextTokens) });
            }
        } else {
            // Token-based rewrite: only emitted when the token list actually changes.
            const tokens = normalizeAttrsTokens(attrsSpan.text);
            const updated = updateAttrsDatatypeLang(tokens, addQuad.object, ctx);
            if (updated.join(' ') !== tokens.join(' ')) {
                edits.push({ start: attrsSpan.start, end: attrsSpan.end, text: attrsTextFor(updated) });
            }
        }
    }

    // Deletes: vacate the slot in the index, then remove the token from the annotation.
    for (const { quad, entry, block } of plan.deletes) {
        const slot = new Slot(block, entry);

        const vacantSlot = slot.markVacant(quad);
        if (vacantSlot && block) {
            // Keep enough block data on the slot to rebuild the block after a re-parse.
            vacantSlot.blockInfo = {
                id: entry.blockId,
                range: block.range,
                attrsRange: block.attrsRange,
                valueRange: block.valueRange,
                carrierType: block.carrierType,
                subject: block.subject,
                context: block.context
            };
            const key = quadToKeyForOrigin(quad);
            if (key) base.quadIndex.set(key, vacantSlot);
        }

        const span = readSpan(block, text, 'attrs');
        if (!span) continue;

        // Preferred: drop the entry by its recorded position.
        if (entry?.entryIndex != null && block?.entries?.length) {
            const nextEntries = removeEntryAt(block, entry.entryIndex);
            if (nextEntries) {
                const nextTokens = nextEntries.map((e) => e.raw).filter(Boolean);
                edits.push({ start: span.start, end: span.end, text: attrsTextFor(nextTokens) });
                continue;
            }
        }

        // Fallback: find the token by kind/IRI.
        const tokens = normalizeAttrsTokens(span.text);
        const { tokens: updated, removed } = slot.removeToken(tokens, ctx, quad);
        if (removed) {
            edits.push({ start: span.start, end: span.end, text: attrsTextFor(updated) });
        }
    }

    // Adds: extend the target block's annotation, or append a new carrier.
    for (const { quad, targetBlock } of plan.adds) {
        const { termType } = quad.object;
        if (termType !== 'Literal' && termType !== 'NamedNode') continue;

        if (!targetBlock) {
            edits.push(buildAppendEdit(quad, text, ctx));
            continue;
        }

        const span = readSpan(targetBlock, text, 'attrs');
        if (!span) continue;

        const tokens = normalizeAttrsTokens(span.text);

        // A subject-only block (like {=ex:order-123}) cannot carry the statement itself.
        if (tokens.length === 1 && tokens[0].startsWith('=')) {
            edits.push(buildAppendEdit(quad, text, ctx));
            continue;
        }

        const existingTokens = blockTokensFromEntries(targetBlock) || tokens;
        let updated = new Slot(targetBlock, null).addToken(existingTokens, ctx, quad);

        // Typed literals also need their datatype token.
        const datatypeIRI = quad.object.datatype?.value;
        if (termType === 'Literal' && quad.object.datatype && datatypeIRI !== 'http://www.w3.org/2001/XMLSchema#string') {
            const datatypeToken = `^^${shortenIRI(datatypeIRI, ctx)}`;
            if (!updated.includes(datatypeToken)) {
                updated = [...updated, datatypeToken];
            }
        }

        if (updated.length !== existingTokens.length) {
            edits.push({ start: span.start, end: span.end, text: writeAttrsTokens(updated) });
        }
    }

    return edits;
}
450
+
451
/**
 * Phase 3 of serialization: applies range replacements, re-parses the result and
 * carries vacant slots over into the new origin.
 *
 * Edits are applied from the highest start offset to the lowest so earlier offsets
 * stay valid. The caller's `edits` array is left untouched (a sorted copy is used).
 * Edits are assumed not to overlap.
 *
 * Vacant slots found in `base.quadIndex` are copied into the re-parsed origin's
 * `quadIndex`; when the slot's block no longer exists after the re-parse and the
 * slot carries `blockInfo`, an empty placeholder block is registered for it.
 *
 * @param {string} text - Original document text.
 * @param {Array<{ start: number, end: number, text: string }>} edits - Replacements on `text`.
 * @param {object} ctx - Context forwarded to `parse` and used as placeholder-block default.
 * @param {object|null|undefined} base - Origin that may hold vacant slots.
 * @returns {{ text: string, origin: object }} Edited text and the merged origin.
 */
function applyEdits(text, edits, ctx, base) {
    // Sort a copy: Array.prototype.sort works in place and would reorder the caller's array.
    const ordered = [...edits].sort((a, b) => b.start - a.start);

    let result = text;
    for (const { start, end, text: replacement } of ordered) {
        result = result.substring(0, start) + replacement + result.substring(end);
    }

    // Collect vacant slots before the re-parse replaces the origin.
    const vacantSlots = new Map();
    base?.quadIndex?.forEach((slot, key) => {
        // Optional chaining: tolerate nullish index values instead of throwing.
        if (slot?.isVacant) {
            vacantSlots.set(key, slot);
        }
    });

    const reparsed = parse(result, { context: ctx });

    // Merge the vacant slots into the fresh origin.
    vacantSlots.forEach((vacantSlot, key) => {
        const { blockId, blockInfo } = vacantSlot;
        if (!reparsed.origin.blocks.has(blockId) && blockInfo) {
            reparsed.origin.blocks.set(blockId, {
                id: blockInfo.id,
                range: blockInfo.range || { start: 0, end: 0 },
                attrsRange: blockInfo.attrsRange,
                valueRange: blockInfo.valueRange,
                carrierType: blockInfo.carrierType || 'span',
                subject: blockInfo.subject || '',
                types: [],
                predicates: [],
                entries: [],
                context: blockInfo.context || { ...ctx }
            });
        }
        reparsed.origin.quadIndex.set(key, vacantSlot);
    });

    return { text: result, origin: reparsed.origin };
}
495
+
496
+ // Helper functions for origin lookup
497
/**
 * Finds the index entry that backs a quad.
 *
 * The quad's origin key is tried first. For literal objects without a direct hit,
 * the index is scanned for the first key whose parsed subject, predicate and
 * literal value equal the quad's.
 *
 * @param {object} quad - Quad to resolve.
 * @param {object|null|undefined} base - Origin exposing an iterable `quadIndex`.
 * @returns {*} The entry, or the falsy result of the direct lookup (`null` when the
 *   quad has no key, otherwise `undefined`) when nothing matches.
 */
function resolveOriginEntry(quad, base) {
    const key = quadToKeyForOrigin(quad);
    const direct = key ? base?.quadIndex?.get(key) : null;
    if (direct || quad.object?.termType !== 'Literal') {
        return direct;
    }

    // Fallback: match by subject, predicate and literal value.
    for (const [indexKey, candidate] of base?.quadIndex || []) {
        const parsed = parseQuadIndexKey(indexKey);
        if (!parsed) continue;
        const sameStatement = parsed.s === quad.subject.value && parsed.p === quad.predicate.value;
        const sameLiteral = parsed.o?.t === 'Literal' && parsed.o?.v === quad.object.value;
        if (sameStatement && sameLiteral) {
            return candidate;
        }
    }

    return direct;
}
517
+
518
/**
 * Chooses the block an added quad should be written into.
 *
 * An anchor registered for the same subject and object signature wins. Otherwise
 * the first block (in `base.blocks` iteration order) that has the quad's subject
 * and an `attrsRange` is used.
 *
 * @param {object} quad - Quad being added.
 * @param {object|null|undefined} base - Origin exposing an iterable `blocks` collection of `[id, block]` pairs.
 * @param {Map<string, { block?: object }>} anchors - Anchors keyed by JSON of `[subject, objectSignature]`.
 * @returns {object|null} The chosen block, or `null` when none qualifies.
 */
function findTargetBlock(quad, base, anchors) {
    const subjectIRI = quad.subject.value;
    const anchorKey = JSON.stringify([subjectIRI, objectSignature(quad.object)]);

    const anchoredBlock = anchors.get(anchorKey)?.block;
    if (anchoredBlock) {
        return anchoredBlock;
    }

    // No anchor: fall back to any annotated block about the same subject.
    const match = Array.from(base?.blocks || []).find(
        ([, block]) => block.subject === subjectIRI && block.attrsRange
    );
    return match ? match[1] : null;
}