mdld-parse 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mdld-parse",
3
- "version": "0.7.3",
3
+ "version": "0.7.4",
4
4
  "description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -38,8 +38,5 @@
38
38
  "homepage": "https://mdld.js.org",
39
39
  "bugs": {
40
40
  "url": "https://github.com/davay42/mdld-parse/issues"
41
- },
42
- "dependencies": {
43
- "rdfa-parse": "^1.0.1"
44
41
  }
45
42
  }
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Shared utilities for MD-LD Parser and Renderer
3
+ * Ensures DRY code and consistent CommonMark processing
4
+ */
5
+
6
+
7
+
8
+ export const DEFAULT_CONTEXT = {
9
+ '@vocab': "http://www.w3.org/2000/01/rdf-schema#",
10
+ rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
11
+ rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
12
+ xsd: 'http://www.w3.org/2001/XMLSchema#',
13
+ sh: "http://www.w3.org/ns/shacl#",
14
+ prov: 'http://www.w3.org/ns/prov#'
15
+ };
16
+
17
+ // CommonMark patterns - shared between parser and renderer
18
+ export const URL_REGEX = /^(https?|ftp|mailto|tag|nih|urn|uuid|did|web|ipfs|ipns|data|file|urn:uuid):/;
19
+ export const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
20
+ export const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
21
+ export const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
22
+ export const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
23
+ export const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
24
+ export const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
25
+
26
+ // Pre-compiled carrier patterns for performance
27
+ export const CARRIER_PATTERN_ARRAY = [
28
+ ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
29
+ ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
30
+ ];
package/src/generate.js CHANGED
@@ -1,21 +1,26 @@
1
1
  import { shortenIRI, expandIRI, DataFactory } from './utils.js';
2
- import { DEFAULT_CONTEXT } from './shared.js';
3
-
4
- // Helper functions for cleaner term type checking
5
- function isLiteral(term) {
6
- return term?.termType === 'Literal';
7
- }
8
-
9
- function isNamedNode(term) {
10
- return term?.termType === 'NamedNode';
11
- }
12
-
13
- function isRdfType(term) {
14
- return term?.value === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
15
- }
16
-
2
+ import { DEFAULT_CONTEXT } from './constants.js';
3
+ import {
4
+ isLiteral,
5
+ collectUsedPrefixes,
6
+ sortQuadsByPredicate,
7
+ generatePrefixDeclaration,
8
+ generateLiteralText,
9
+ generateObjectText,
10
+ filterQuadsByType
11
+ } from './shared.js';
12
+
13
+ export function extractLocalName(iri, ctx = {}) {
14
+ if (!iri) return iri;
15
+
16
+ // Check for exact prefix matches first
17
+ for (const [prefix, namespace] of Object.entries(ctx)) {
18
+ if (iri.startsWith(namespace) || iri.startsWith(namespace.slice(0, -1))) {
19
+ return iri.substring(namespace.length);
20
+ }
21
+ }
17
22
 
18
- function extractLocalName(iri) {
23
+ // Fallback to original logic for local names
19
24
  const separators = ['#', '/', ':'];
20
25
  for (const sep of separators) {
21
26
  const lastSep = iri.lastIndexOf(sep);
@@ -83,14 +88,14 @@ function groupQuadsBySubject(quads) {
83
88
 
84
89
  function buildDeterministicMDLD(subjectGroups, context) {
85
90
  let text = '';
91
+ const usedPrefixes = collectUsedPrefixes(subjectGroups, context);
86
92
 
87
93
  // Add prefixes first (deterministic order), but exclude default context prefixes
88
94
  const sortedPrefixes = Object.entries(context).sort(([a], [b]) => a.localeCompare(b));
89
95
  for (const [prefix, namespace] of sortedPrefixes) {
90
96
  // Skip default context prefixes - they're implicit in MDLD
91
- if (prefix !== '@vocab' && !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix]) {
92
- const prefixDecl = `[${prefix}] <${namespace}>\n`;
93
- text += prefixDecl;
97
+ if (prefix !== '@vocab' && !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix] && usedPrefixes.has(prefix)) {
98
+ text += generatePrefixDeclaration(prefix, namespace);
94
99
  }
95
100
  }
96
101
 
@@ -105,47 +110,25 @@ function buildDeterministicMDLD(subjectGroups, context) {
105
110
  const subjectQuads = subjectGroups.get(subjectIRI);
106
111
  const shortSubject = shortenIRI(subjectIRI, context);
107
112
 
108
- // Separate types, literals, and objects using helper functions
109
- const types = subjectQuads.filter(q => isRdfType(q.predicate));
110
- const literals = subjectQuads.filter(q => isLiteral(q.object) && !isRdfType(q.predicate));
111
- const objects = subjectQuads.filter(q => isNamedNode(q.object) && !isRdfType(q.predicate));
113
+ // Separate types, literals, and objects using shared utility
114
+ const { types, literals, objects } = filterQuadsByType(subjectQuads);
112
115
 
113
116
  // Generate heading
114
- const localSubjectName = extractLocalName(subjectIRI);
117
+ const localSubjectName = extractLocalName(subjectIRI, context);
115
118
  const typeAnnotations = types.length > 0
116
- ? ' ' + types.map(t => '.' + extractLocalName(t.object.value)).sort().join(' ')
119
+ ? ' ' + types.map(t => '.' + shortenIRI(t.object.value, context)).sort().join(' ')
117
120
  : '';
118
121
 
119
- const headingText = `# ${localSubjectName} {=${shortSubject}${typeAnnotations}}\n\n`;
120
-
121
- text += headingText;
122
-
123
- // Add literals (deterministic order)
124
- const sortedLiterals = literals.sort((a, b) => a.predicate.value.localeCompare(b.predicate.value));
125
- for (const quad of sortedLiterals) {
126
- const predShort = shortenIRI(quad.predicate.value, context);
127
- let annotation = predShort;
128
-
129
- // Use DataFactory XSD constants for datatype comparison
130
- const xsdString = 'http://www.w3.org/2001/XMLSchema#string';
131
- if (quad.object.language) {
132
- annotation += ` @${quad.object.language}`;
133
- } else if (quad.object.datatype.value !== xsdString) {
134
- annotation += ` ^^${shortenIRI(quad.object.datatype.value, context)}`;
135
- }
122
+ text += `# ${localSubjectName} {=${shortSubject}${typeAnnotations}}\n`;
136
123
 
137
- const literalText = `[${quad.object.value}] {${annotation}}\n`;
138
- text += literalText;
139
- }
124
+ // Add literals and objects using shared utilities
125
+ sortQuadsByPredicate(literals).forEach(quad => {
126
+ text += generateLiteralText(quad, context);
127
+ });
140
128
 
141
- // Add objects (deterministic order)
142
- const sortedObjects = objects.sort((a, b) => a.predicate.value.localeCompare(b.predicate.value));
143
- for (const quad of sortedObjects) {
144
- const objShort = shortenIRI(quad.object.value, context);
145
- const predShort = shortenIRI(quad.predicate.value, context);
146
- const objectText = `[${objShort}] {+${objShort} ?${predShort}}\n`;
147
- text += objectText;
148
- }
129
+ sortQuadsByPredicate(objects).forEach(quad => {
130
+ text += generateObjectText(quad, context);
131
+ });
149
132
 
150
133
  text += '\n';
151
134
  }
package/src/index.js CHANGED
@@ -3,7 +3,7 @@ export { merge } from './merge.js';
3
3
  export { generate } from './generate.js';
4
4
  export { locate } from './locate.js';
5
5
  export { render } from './render.js';
6
- export { DEFAULT_CONTEXT } from './shared.js';
6
+ export { DEFAULT_CONTEXT } from './constants.js';
7
7
  export {
8
8
  DataFactory,
9
9
  hash,
package/src/locate.js CHANGED
@@ -18,21 +18,6 @@ export function locate(quad, origin) {
18
18
  return null;
19
19
  }
20
20
 
21
- // Find the origin entry in quadIndex
22
- const entry = origin.quadIndex.get(quadKey);
23
- if (!entry) {
24
- return null;
25
- }
26
-
27
- // Return the lean origin entry structure
28
- return {
29
- blockId: entry.blockId,
30
- range: entry.range,
31
- carrierType: entry.carrierType,
32
- subject: entry.subject,
33
- predicate: entry.predicate,
34
- context: entry.context,
35
- value: entry.value,
36
- polarity: entry.polarity
37
- };
21
+ // Return the origin entry directly - no need to create new object
22
+ return origin.quadIndex.get(quadKey) || null;
38
23
  }
package/src/merge.js CHANGED
@@ -1,15 +1,14 @@
1
1
  import { parse } from './parse.js';
2
- import { DEFAULT_CONTEXT } from './shared.js';
2
+ import { quadIndexKey } from './shared.js';
3
+ import { DEFAULT_CONTEXT } from './constants.js';
3
4
 
4
5
  /**
5
- * Creates a unique key for quad identity matching
6
+ * Creates a unique key for quad identity matching - using shared utility
6
7
  * @param {Quad} quad
7
8
  * @returns {string}
8
9
  */
9
10
  function quadKey(quad) {
10
- const datatype = quad.object.datatype?.value || '';
11
- const language = quad.object.language || '';
12
- return `${quad.subject.value}|${quad.predicate.value}|${quad.object.value}|${datatype}|${language}`;
11
+ return quadIndexKey(quad.subject, quad.predicate, quad.object);
13
12
  }
14
13
 
15
14
  /**
package/src/parse.js CHANGED
@@ -1,93 +1,130 @@
1
1
  import {
2
2
  DataFactory,
3
3
  expandIRI,
4
- parseSemanticBlock,
5
4
  quadIndexKey,
6
5
  createLiteral,
7
6
  hash
8
7
  } from './utils.js';
9
- import { DEFAULT_CONTEXT, URL_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX, STANDALONE_SUBJECT_REGEX, INLINE_CARRIER_PATTERNS } from './shared.js';
8
+ import {
9
+ DEFAULT_CONTEXT,
10
+ URL_REGEX,
11
+ FENCE_REGEX,
12
+ PREFIX_REGEX,
13
+ HEADING_REGEX,
14
+ UNORDERED_LIST_REGEX,
15
+ BLOCKQUOTE_REGEX,
16
+ STANDALONE_SUBJECT_REGEX,
17
+ CARRIER_PATTERN_ARRAY,
18
+
19
+ } from './constants.js';
20
+ import {
10
21
 
11
- // Cache for fence regex patterns to avoid recreation
12
- const FENCE_CLOSE_PATTERNS = new Map();
22
+ getFenceClosePattern,
23
+ calcRangeInfo,
24
+ calcAttrsRange,
25
+ createToken,
26
+ createCarrier,
27
+ createListToken,
28
+ parseSemCached,
29
+ EMPTY_SEM,
30
+ parseLangAndAttrs,
31
+ findMatchingBracket,
32
+ extractUrlFromBrackets,
33
+ extractAttributesFromText,
34
+ determineCarrierType,
35
+ calcCarrierRanges,
36
+ extractCleanText,
37
+ RDF_TYPE,
38
+ RDF_STATEMENT,
39
+ RDF_SUBJECT,
40
+ RDF_PREDICATE,
41
+ RDF_OBJECT,
42
+ createLeanOriginEntry,
43
+ resolveFragment,
44
+ resolveSubject,
45
+ resolveObject,
46
+ processTokenWithBlockTracking
47
+ } from './shared.js';
13
48
 
14
- function getFenceClosePattern(fenceChar) {
15
- if (!FENCE_CLOSE_PATTERNS.has(fenceChar)) {
16
- FENCE_CLOSE_PATTERNS.set(fenceChar, new RegExp(`^(${fenceChar}{3,})`));
17
- }
18
- return FENCE_CLOSE_PATTERNS.get(fenceChar);
19
- }
20
49
 
21
- function parseLangAndAttrs(langAndAttrs) {
22
- const spaceIndex = langAndAttrs.indexOf(' ');
23
- const braceIndex = langAndAttrs.indexOf('{');
24
- const langEnd = Math.min(
25
- spaceIndex > -1 ? spaceIndex : Infinity,
26
- braceIndex > -1 ? braceIndex : Infinity
27
- );
28
- return {
29
- lang: langAndAttrs.substring(0, langEnd),
30
- attrsText: langAndAttrs.substring(langEnd).match(/\{[^{}]*\}/)?.[0] || null
50
+ export function parse(text, options = {}) {
51
+ const state = {
52
+ ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
53
+ df: options.dataFactory || DataFactory,
54
+ quads: [],
55
+ quadBuffer: new Map(),
56
+ removeSet: new Set(),
57
+ origin: {
58
+ quadIndex: new Map(),
59
+ blocks: new Map(),
60
+ documentStructure: []
61
+ },
62
+ currentSubject: null,
63
+ tokens: null,
64
+ currentTokenIndex: -1,
65
+ statements: [],
66
+ statementCandidates: new Map(),
67
+ currentBlock: null,
68
+ blockStack: []
31
69
  };
32
- }
33
70
 
34
- const semCache = {};
35
- const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
71
+ state.tokens = scanTokens(text);
72
+
73
+ // Single-pass processing: resolve prefixes AND process tokens together
74
+ for (let i = 0; i < state.tokens.length; i++) {
75
+ const token = state.tokens[i];
76
+ state.currentTokenIndex = i;
77
+
78
+ // Handle prefix tokens immediately during main pass
79
+ if (token.type === 'prefix') {
80
+ let resolvedIri = token.iri;
81
+ if (token.iri.includes(':')) {
82
+ const colonIndex = token.iri.indexOf(':');
83
+ const potentialPrefix = token.iri.substring(0, colonIndex);
84
+ const reference = token.iri.substring(colonIndex + 1);
85
+ if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
86
+ resolvedIri = state.ctx[potentialPrefix] + reference;
87
+ }
88
+ }
89
+ state.ctx[token.prefix] = resolvedIri;
90
+ continue; // Skip token processor for prefixes
91
+ }
36
92
 
37
- function parseSemCached(attrs) {
38
- if (!attrs) return EMPTY_SEM;
39
- let sem = semCache[attrs];
40
- if (!sem) {
41
- sem = Object.freeze(parseSemanticBlock(attrs));
42
- semCache[attrs] = sem;
93
+ // Process all other tokens
94
+ TOKEN_PROCESSORS[token.type]?.(token, state);
43
95
  }
44
- return sem;
45
- }
46
96
 
47
- function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
48
- const wsLength = prefixLength < line.length && line[prefixLength] === ' ' ? 1 :
49
- line.slice(prefixLength).match(/^\s+/)?.[0]?.length || 0;
50
- const valueStartInLine = prefixLength + wsLength;
51
- return {
52
- valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
53
- attrsRange: calcAttrsRange(line, attrs, lineStart)
54
- };
55
- }
97
+ // Optimized quad filtering - use Set.has() instead of array.includes()
98
+ const quadKeys = new Set();
99
+ for (const quad of state.quads) {
100
+ quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
101
+ }
56
102
 
57
- function calcAttrsRange(line, attrs, lineStart) {
58
- if (!attrs) return null;
59
- const attrsStartInLine = line.lastIndexOf(attrs);
60
- return attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length] : null;
61
- }
103
+ // Direct Set iteration - more efficient than filter()
104
+ const filteredRemove = [];
105
+ for (const quad of state.removeSet) {
106
+ const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
107
+ if (!quadKeys.has(key)) {
108
+ filteredRemove.push(quad);
109
+ }
110
+ }
62
111
 
63
- function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
64
- const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
65
- Object.defineProperty(token, '_carriers', {
66
- enumerable: false, writable: true, value: null
67
- });
68
- return token;
112
+ return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
69
113
  }
70
114
 
115
+
116
+ // Cache for fence regex patterns - using shared utility
117
+
71
118
  function getCarriers(token) {
72
119
  if (token.type === 'code') return [];
73
120
  return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
74
121
  }
75
122
 
76
- const createListToken = (type, line, lineStart, pos, match) => {
77
- const attrs = match[4] || null;
78
- const prefix = match[1].length + (match[2] ? match[2].length : 0);
79
- const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
80
- return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
81
- rangeInfo.attrsRange, rangeInfo.valueRange, { indent: match[1].length });
82
- };
83
-
84
123
  function scanTokens(text) {
85
124
  const tokens = [];
86
125
  const lines = text.split('\n');
87
126
  let pos = 0;
88
127
  let codeBlock = null;
89
-
90
- // Direct lookup instead of linear search
91
128
  const PROCESSORS = [
92
129
  { type: 'fence', test: line => FENCE_REGEX.test(line.trim()), process: handleFence },
93
130
  { type: 'content', test: () => codeBlock, process: line => codeBlock.content.push(line) },
@@ -194,16 +231,6 @@ function scanTokens(text) {
194
231
  return tokens;
195
232
  }
196
233
 
197
- function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
198
- return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
199
- }
200
-
201
- // Pre-compiled carrier patterns for better performance
202
- const CARRIER_PATTERN_ARRAY = [
203
- ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
204
- ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
205
- ];
206
-
207
234
  function extractInlineCarriers(text, baseOffset = 0) {
208
235
  const carriers = [];
209
236
  let pos = 0;
@@ -268,74 +295,6 @@ function extractInlineCarriers(text, baseOffset = 0) {
268
295
  return carriers;
269
296
  }
270
297
 
271
- function calcCarrierRanges(match, baseOffset, matchStart) {
272
- const valueStart = baseOffset + matchStart + match[0].indexOf(match[1]);
273
- const valueEnd = valueStart + match[1].length;
274
- const attrsStart = baseOffset + matchStart + match[0].indexOf('{');
275
- const attrsEnd = attrsStart + match[2].length + 2; // +2 for { and }
276
- return {
277
- valueRange: [valueStart, valueEnd],
278
- attrsRange: [attrsStart + 1, attrsEnd - 1], // Exclude braces
279
- range: [baseOffset + matchStart, attrsEnd],
280
- pos: matchStart + match[0].length // pos should be relative to current text, not document
281
- };
282
- }
283
-
284
- function findMatchingBracket(text, bracketStart) {
285
- let bracketDepth = 1;
286
- let bracketEnd = bracketStart + 1;
287
-
288
- while (bracketEnd < text.length && bracketDepth > 0) {
289
- if (text[bracketEnd] === '[') bracketDepth++;
290
- else if (text[bracketEnd] === ']') bracketDepth--;
291
- bracketEnd++;
292
- }
293
-
294
- return bracketDepth > 0 ? null : bracketEnd;
295
- }
296
-
297
- function extractUrlFromBrackets(text, bracketEnd) {
298
- let url = null;
299
- let spanEnd = bracketEnd;
300
-
301
- if (text[spanEnd] === '(') {
302
- const parenEnd = text.indexOf(')', spanEnd);
303
- if (parenEnd !== -1) {
304
- url = text.substring(spanEnd + 1, parenEnd);
305
- spanEnd = parenEnd + 1;
306
- }
307
- }
308
-
309
- return { url, spanEnd };
310
- }
311
-
312
- function extractAttributesFromText(text, spanEnd, baseOffset) {
313
- let attrs = null;
314
- let attrsRange = null;
315
- const remaining = text.substring(spanEnd);
316
-
317
- const wsMatch = remaining.match(/^\s+/);
318
- const attrsStart = wsMatch ? wsMatch[0].length : 0;
319
-
320
- if (remaining[attrsStart] === '{') {
321
- const braceEnd = remaining.indexOf('}', attrsStart);
322
- if (braceEnd !== -1) {
323
- attrs = remaining.substring(attrsStart, braceEnd + 1);
324
- const absStart = baseOffset + spanEnd + attrsStart;
325
- attrsRange = [absStart, absStart + attrs.length];
326
- spanEnd += braceEnd + 1;
327
- }
328
- }
329
-
330
- return { attrs, attrsRange, finalSpanEnd: spanEnd };
331
- }
332
-
333
- function determineCarrierType(url) {
334
- if (url && !url.startsWith('=')) {
335
- return { carrierType: 'link', resourceIRI: url };
336
- }
337
- return { carrierType: 'span', resourceIRI: null };
338
- }
339
298
 
340
299
  function createBlockEntry(token, state) {
341
300
  const blockId = token._blockId || hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
@@ -364,31 +323,6 @@ function createBlockEntry(token, state) {
364
323
  return blockEntry;
365
324
  }
366
325
 
367
- function extractCleanText(token) {
368
- if (!token.text) return '';
369
-
370
- let text = token.text;
371
-
372
- // Remove semantic annotations
373
- if (token.attrsRange) {
374
- const beforeAttrs = text.substring(0, token.attrsRange[0] - (token.range?.[0] || 0));
375
- const afterAttrs = text.substring(token.attrsRange[1] - (token.range?.[0] || 0));
376
- text = beforeAttrs + afterAttrs;
377
- }
378
-
379
- // Clean based on token type
380
- switch (token.type) {
381
- case 'heading':
382
- return text.replace(/^#+\s*/, '').trim();
383
- case 'list':
384
- return text.replace(/^[-*+]\s*/, '').trim();
385
- case 'blockquote':
386
- return text.replace(/^>\s*/, '').trim();
387
- default:
388
- return text.trim();
389
- }
390
- }
391
-
392
326
  function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
393
327
  // Update subject if available
394
328
  if (sem.subject && sem.subject !== 'RESET') {
@@ -539,17 +473,8 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
539
473
  // Detect rdf:Statement pattern during single-pass parsing
540
474
  detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
541
475
 
542
- // Create lean origin entry - avoid spread operator for better performance
543
- const originEntry = {
544
- blockId: block.id,
545
- range: block.range,
546
- carrierType: block.carrierType,
547
- subject: subject.value,
548
- predicate: predicate.value,
549
- context: block.context, // Direct reference instead of spread
550
- polarity: meta?.remove ? '-' : '+',
551
- value: block.text || ''
552
- };
476
+ // Create lean origin entry using shared utility
477
+ const originEntry = createLeanOriginEntry(block, subject, predicate, meta);
553
478
 
554
479
  quadIndex.set(quadKey, originEntry);
555
480
 
@@ -563,13 +488,6 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
563
488
  }
564
489
  }
565
490
 
566
- // Extract RDF constants once at module level for efficiency
567
- const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
568
- const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
569
- const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
570
- const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
571
- const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
572
-
573
491
  function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
574
492
  // Skip if not called from parse context (for testing compatibility)
575
493
  if (!statements || !statementCandidates) return;
@@ -619,30 +537,6 @@ function detectStatementPatternSinglePass(quad, dataFactory, meta, statements =
619
537
  }
620
538
  }
621
539
 
622
- const resolveFragment = (fragment, state) => {
623
- if (!state.currentSubject) return null;
624
- const subjectValue = state.currentSubject.value;
625
- const hashIndex = subjectValue.indexOf('#');
626
- const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
627
- return state.df.namedNode(baseIRI + '#' + fragment);
628
- };
629
-
630
- function resolveSubject(sem, state) {
631
- if (!sem.subject) return null;
632
- if (sem.subject === 'RESET') {
633
- state.currentSubject = null;
634
- return null;
635
- }
636
- if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
637
- return state.df.namedNode(expandIRI(sem.subject, state.ctx));
638
- }
639
-
640
- function resolveObject(sem, state) {
641
- if (!sem.object) return null;
642
- if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
643
- return state.df.namedNode(expandIRI(sem.object, state.ctx));
644
- }
645
-
646
540
  const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
647
541
  const expandedType = expandIRI(typeIRI, state.ctx);
648
542
  const typeInfo = typeof entryIndex === 'object' ? entryIndex : { entryIndex, remove: false };
@@ -748,123 +642,9 @@ function processStandaloneSubject(token, state) {
748
642
  }
749
643
 
750
644
  const TOKEN_PROCESSORS = {
751
- heading: (token, state) => {
752
- const blockEntry = createBlockEntry(token, state);
753
- state.currentBlock = blockEntry;
754
- state.blockStack.push(blockEntry.id);
755
-
756
- processTokenAnnotations(token, state, token.type);
757
-
758
- state.blockStack.pop();
759
- state.currentBlock = state.blockStack.length > 0 ?
760
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
761
- },
762
- code: (token, state) => {
763
- const blockEntry = createBlockEntry(token, state);
764
- state.currentBlock = blockEntry;
765
- state.blockStack.push(blockEntry.id);
766
-
767
- processTokenAnnotations(token, state, token.type);
768
-
769
- state.blockStack.pop();
770
- state.currentBlock = state.blockStack.length > 0 ?
771
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
772
- },
773
- blockquote: (token, state) => {
774
- const blockEntry = createBlockEntry(token, state);
775
- state.currentBlock = blockEntry;
776
- state.blockStack.push(blockEntry.id);
777
-
778
- processTokenAnnotations(token, state, token.type);
779
-
780
- state.blockStack.pop();
781
- state.currentBlock = state.blockStack.length > 0 ?
782
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
783
- },
784
- para: (token, state) => {
785
- const blockEntry = createBlockEntry(token, state);
786
- state.currentBlock = blockEntry;
787
- state.blockStack.push(blockEntry.id);
788
-
789
- processStandaloneSubject(token, state);
790
- processTokenAnnotations(token, state, token.type);
791
-
792
- state.blockStack.pop();
793
- state.currentBlock = state.blockStack.length > 0 ?
794
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
795
- },
796
- list: (token, state) => {
797
- const blockEntry = createBlockEntry(token, state);
798
- state.currentBlock = blockEntry;
799
- state.blockStack.push(blockEntry.id);
800
-
801
- processTokenAnnotations(token, state, token.type);
802
-
803
- state.blockStack.pop();
804
- state.currentBlock = state.blockStack.length > 0 ?
805
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
806
- },
645
+ heading: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
646
+ code: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
647
+ blockquote: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
648
+ para: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry, [processStandaloneSubject]),
649
+ list: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
807
650
  };
808
-
809
- export function parse(text, options = {}) {
810
- const state = {
811
- ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
812
- df: options.dataFactory || DataFactory,
813
- quads: [],
814
- quadBuffer: new Map(),
815
- removeSet: new Set(),
816
- origin: {
817
- quadIndex: new Map(),
818
- blocks: new Map(),
819
- documentStructure: []
820
- },
821
- currentSubject: null,
822
- tokens: null,
823
- currentTokenIndex: -1,
824
- statements: [],
825
- statementCandidates: new Map(), // Track incomplete rdf:Statement patterns
826
- currentBlock: null,
827
- blockStack: []
828
- };
829
-
830
- state.tokens = scanTokens(text);
831
-
832
- // Single loop instead of filter+forEach for better performance
833
- for (const token of state.tokens) {
834
- if (token.type === 'prefix') {
835
- let resolvedIri = token.iri;
836
- if (token.iri.includes(':')) {
837
- const colonIndex = token.iri.indexOf(':');
838
- const potentialPrefix = token.iri.substring(0, colonIndex);
839
- const reference = token.iri.substring(colonIndex + 1);
840
- if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
841
- resolvedIri = state.ctx[potentialPrefix] + reference;
842
- }
843
- }
844
- state.ctx[token.prefix] = resolvedIri;
845
- }
846
- }
847
-
848
- for (let i = 0; i < state.tokens.length; i++) {
849
- const token = state.tokens[i];
850
- state.currentTokenIndex = i;
851
- TOKEN_PROCESSORS[token.type]?.(token, state);
852
- }
853
-
854
- // Optimize array operations - avoid Array.from() and filter()
855
- const quadKeys = new Set();
856
- for (const quad of state.quads) {
857
- quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
858
- }
859
-
860
- // Direct iteration instead of Array.from() + filter()
861
- const filteredRemove = [];
862
- for (const quad of state.removeSet) {
863
- const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
864
- if (!quadKeys.has(key)) {
865
- filteredRemove.push(quad);
866
- }
867
- }
868
-
869
- return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
870
- }
package/src/render.js CHANGED
@@ -6,7 +6,12 @@ import {
6
6
  parseSemanticBlock,
7
7
  hash
8
8
  } from './utils.js';
9
- import { DEFAULT_CONTEXT } from './shared.js';
9
+ import {
10
+ escapeHtml,
11
+ getIndentLevel,
12
+ processPredicates
13
+ } from './shared.js';
14
+ import { DEFAULT_CONTEXT } from './constants.js';
10
15
 
11
16
  /**
12
17
  * Render MD-LD to HTML+RDFa
@@ -302,17 +307,6 @@ function parseMarkdownList(markdownList, blocks, state) {
302
307
  return html;
303
308
  }
304
309
 
305
- /**
306
- * Get indent level from source text
307
- */
308
- function getIndentLevel(block, sourceText) {
309
- if (!block.range || !sourceText) return 0;
310
-
311
- const text = sourceText.substring(block.range.start, block.range.end);
312
- const indentMatch = text.match(/^(\s*)/);
313
- return indentMatch ? indentMatch[1].length : 0;
314
- }
315
-
316
310
  /**
317
311
  * Render a single block
318
312
  */
@@ -408,26 +402,9 @@ function buildRDFaAttrsFromBlock(block, ctx) {
408
402
  attrs.push(`typeof="${escapeHtml(types)}"`);
409
403
  }
410
404
 
411
- // Predicates
405
+ // Predicates using shared utility
412
406
  if (block.predicates && block.predicates.length > 0) {
413
- const literalProps = [];
414
- const objectProps = [];
415
- const reverseProps = [];
416
-
417
- block.predicates.forEach(pred => {
418
- const iri = typeof pred === 'string' ? pred : pred.iri;
419
- const expanded = expandIRI(iri, ctx);
420
- const shortened = shortenIRI(expanded, ctx);
421
- const form = typeof pred === 'string' ? '' : (pred.form || '');
422
-
423
- if (form === '!') {
424
- reverseProps.push(shortened);
425
- } else if (form === '?') {
426
- objectProps.push(shortened);
427
- } else {
428
- literalProps.push(shortened);
429
- }
430
- });
407
+ const { literalProps, objectProps, reverseProps } = processPredicates(block.predicates, ctx);
431
408
 
432
409
  if (literalProps.length > 0) {
433
410
  attrs.push(`property="${escapeHtml(literalProps.join(' '))}"`);
@@ -474,17 +451,3 @@ function wrapWithRDFaContext(html, ctx) {
474
451
 
475
452
  return `<div${prefixDecl}${vocabDecl}>${html}</div>`;
476
453
  }
477
-
478
- /**
479
- * Escape HTML special characters
480
- */
481
- function escapeHtml(text) {
482
- const map = {
483
- '&': '&amp;',
484
- '<': '&lt;',
485
- '>': '&gt;',
486
- '"': '&quot;',
487
- "'": '&#39;'
488
- };
489
- return String(text || '').replace(/[&<>"']/g, m => map[m]);
490
- }
package/src/shared.js CHANGED
@@ -1,37 +1,5 @@
1
- /**
2
- * Shared utilities for MD-LD Parser and Renderer
3
- * Ensures DRY code and consistent CommonMark processing
4
- */
5
-
6
- export const DEFAULT_CONTEXT = {
7
- '@vocab': "http://www.w3.org/2000/01/rdf-schema#",
8
- rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
9
- rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
10
- xsd: 'http://www.w3.org/2001/XMLSchema#',
11
- sh: "http://www.w3.org/ns/shacl#",
12
- prov: 'http://www.w3.org/ns/prov#'
13
- };
14
-
15
- // CommonMark patterns - shared between parser and renderer
16
- export const URL_REGEX = /^(https?|ftp|mailto|tag|nih|urn|uuid|did|web|ipfs|ipns|data|file|urn:uuid):/;
17
- export const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
18
- export const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
19
- export const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
20
- export const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
21
- export const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
22
- export const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
23
-
24
- // Inline carrier patterns - shared extraction logic
25
- export const INLINE_CARRIER_PATTERNS = {
26
- EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
27
- CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
28
- };
29
-
30
- // Pre-compiled carrier patterns for performance
31
- export const CARRIER_PATTERN_ARRAY = [
32
- ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
33
- ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
34
- ];
1
+ import { DEFAULT_CONTEXT, STANDALONE_SUBJECT_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX } from './constants.js';
2
+ import { parseSemanticBlock, expandIRI, shortenIRI } from './utils.js';
35
3
 
36
4
  // Cache for fence regex patterns
37
5
  export const FENCE_CLOSE_PATTERNS = new Map();
@@ -156,6 +124,172 @@ export const PROCESSORS = [
156
124
  { test: line => true, process: null } // Default: paragraph
157
125
  ];
158
126
 
127
/**
 * Token scanning processors - shared between parser and renderer.
 *
 * Each entry has a `type`, a `test(line)` predicate and a `process` slot that
 * is `null` here (the original notes say parse.js overrides it).
 * The `content` entry never matches by itself, and `para` returns the trimmed
 * line, so any non-blank line is truthy.
 */
export const TOKEN_PROCESSORS = [
  ['fence', line => FENCE_REGEX.test(line.trim())],
  ['content', line => false],
  ['prefix', line => PREFIX_REGEX.test(line)],
  ['heading', line => HEADING_REGEX.test(line)],
  ['list', line => UNORDERED_LIST_REGEX.test(line)],
  ['blockquote', line => BLOCKQUOTE_REGEX.test(line)],
  ['para', line => line.trim()]
].map(([type, test]) => ({ type, test, process: null }));
137
+
138
/**
 * Split a fenced-code info string into its language and `{...}` attribute block.
 *
 * Leading whitespace is ignored (FENCE_REGEX captures everything after the
 * fence, including the space in "``` js"). The language ends at the first
 * whitespace character or `{`. A missing info string yields an empty language.
 *
 * @param {string|null|undefined} langAndAttrs - Raw text following the fence marker.
 * @returns {{lang: string, attrsText: string|null}} `attrsText` includes the braces,
 *   or is `null` when no flat `{...}` block follows the language.
 */
export function parseLangAndAttrs(langAndAttrs) {
  const info = String(langAndAttrs ?? '').trimStart();
  const boundary = info.search(/[\s{]/);
  const langEnd = boundary === -1 ? info.length : boundary;
  return {
    lang: info.slice(0, langEnd),
    attrsText: info.slice(langEnd).match(/\{[^{}]*\}/)?.[0] || null
  };
}
151
+
152
// Carrier extraction utilities

/**
 * Find the end of a bracketed span, honouring nesting.
 *
 * Backslash-escaped characters (e.g. `\]`) are skipped so that an escaped
 * bracket is treated as literal text rather than as span structure.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketStart - Index of the opening `[`.
 * @returns {number|null} Index just past the matching `]`, or `null` when the
 *   span is never closed.
 */
export function findMatchingBracket(text, bracketStart) {
  let depth = 1;
  let cursor = bracketStart + 1;

  while (cursor < text.length && depth > 0) {
    const ch = text[cursor];
    if (ch === '\\') {
      // Skip the escape and the character it protects.
      cursor += 2;
      continue;
    }
    if (ch === '[') depth++;
    else if (ch === ']') depth--;
    cursor++;
  }

  return depth > 0 ? null : cursor;
}
165
+
166
/**
 * Read the `(url)` part that may directly follow a bracketed span.
 *
 * Parentheses inside the URL are matched as balanced pairs, so a destination
 * such as `.../Rust_(language)` is returned whole. If the parentheses never
 * balance, the first `)` is used as the terminator.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketEnd - Index just past the closing `]`.
 * @returns {{url: string|null, spanEnd: number}} `url` is `null` and `spanEnd`
 *   equals `bracketEnd` when no closed `(...)` follows.
 */
export function extractUrlFromBrackets(text, bracketEnd) {
  if (text[bracketEnd] !== '(') return { url: null, spanEnd: bracketEnd };

  let depth = 0;
  let parenEnd = -1;
  for (let i = bracketEnd; i < text.length; i++) {
    const ch = text[i];
    if (ch === '(') {
      depth++;
    } else if (ch === ')') {
      depth--;
      if (depth === 0) {
        parenEnd = i;
        break;
      }
    }
  }

  // Unbalanced input: fall back to the first closing parenthesis.
  if (parenEnd === -1) parenEnd = text.indexOf(')', bracketEnd);
  if (parenEnd === -1) return { url: null, spanEnd: bracketEnd };

  return { url: text.substring(bracketEnd + 1, parenEnd), spanEnd: parenEnd + 1 };
}
180
+
181
/**
 * Read an optional `{...}` attribute block that follows a span, allowing
 * whitespace between the span and the opening brace.
 *
 * @param {string} text - Text being scanned.
 * @param {number} spanEnd - Index in `text` where the span ended.
 * @param {number} baseOffset - Offset added to produce the absolute range.
 * @returns {{attrs: string|null, attrsRange: number[]|null, finalSpanEnd: number}}
 *   `attrs` includes the braces; `attrsRange` is `[start, end]` offset by
 *   `baseOffset`; `finalSpanEnd` is `spanEnd` advanced by the closing brace's
 *   index within the remaining text plus one when a block was found.
 */
export function extractAttributesFromText(text, spanEnd, baseOffset) {
  const notFound = { attrs: null, attrsRange: null, finalSpanEnd: spanEnd };
  const tail = text.substring(spanEnd);

  // First non-whitespace character must open the attribute block.
  const open = tail.search(/\S/);
  if (open === -1 || tail[open] !== '{') return notFound;

  const close = tail.indexOf('}', open);
  if (close === -1) return notFound;

  const attrs = tail.substring(open, close + 1);
  const absStart = baseOffset + spanEnd + open;
  return {
    attrs,
    attrsRange: [absStart, absStart + attrs.length],
    finalSpanEnd: spanEnd + close + 1
  };
}
201
+
202
/**
 * Classify a bracketed carrier by its URL part.
 *
 * @param {string|null} url - Text found inside `(...)`, if any.
 * @returns {{carrierType: string, resourceIRI: string|null}} A `link` carrying
 *   the URL when one is present and does not start with `=`; otherwise a
 *   `span` with no resource IRI.
 */
export function determineCarrierType(url) {
  const isLink = Boolean(url) && !url.startsWith('=');
  return isLink
    ? { carrierType: 'link', resourceIRI: url }
    : { carrierType: 'span', resourceIRI: null };
}
208
+
209
/**
 * Compute source ranges for an inline carrier regex match.
 *
 * The attribute block is located by searching backwards for the literal
 * `{attrs}` text, so a `{` inside the carrier value is not mistaken for the
 * start of the annotation. If that text cannot be found, the first `{` in the
 * match is used.
 *
 * @param {Array<string>} match - Regex match: `[0]` whole match, `[1]` value, `[2]` attributes without braces.
 * @param {number} baseOffset - Offset of the scanned text.
 * @param {number} matchStart - Index of the match within the scanned text.
 * @returns {{valueRange: number[], attrsRange: number[], range: number[], pos: number}}
 *   `attrsRange` excludes the braces; `pos` is relative to the scanned text,
 *   the other ranges include `baseOffset`.
 */
export function calcCarrierRanges(match, baseOffset, matchStart) {
  const [whole, value, attrs] = match;
  const origin = baseOffset + matchStart;

  const valueStart = origin + whole.indexOf(value);
  const valueEnd = valueStart + value.length;

  const blockIndex = whole.lastIndexOf(`{${attrs}}`);
  const attrsStart = origin + (blockIndex !== -1 ? blockIndex : whole.indexOf('{'));
  const attrsEnd = attrsStart + attrs.length + 2; // +2 for { and }

  return {
    valueRange: [valueStart, valueEnd],
    attrsRange: [attrsStart + 1, attrsEnd - 1], // Exclude braces
    range: [origin, attrsEnd],
    pos: matchStart + whole.length // relative to the scanned text, not the document
  };
}
221
+
222
// Clean text extraction utilities

/**
 * Return a token's text without its semantic annotation and block marker.
 *
 * `token.attrsRange` is rebased by `token.range[0]` (0 when absent) before the
 * annotated slice is cut out. List tokens may be indented and may use an
 * ordered marker (`1.`), matching what UNORDERED_LIST_REGEX accepts, so both
 * forms are stripped.
 *
 * @param {{text?: string, type?: string, range?: number[], attrsRange?: number[]}} token
 * @returns {string} Trimmed text, or `''` when the token has no text.
 */
export function extractCleanText(token) {
  if (!token.text) return '';

  let text = token.text;

  // Remove semantic annotations
  if (token.attrsRange) {
    const base = token.range?.[0] || 0;
    text = text.substring(0, token.attrsRange[0] - base) + text.substring(token.attrsRange[1] - base);
  }

  // Clean based on token type
  switch (token.type) {
    case 'heading':
      return text.replace(/^#+\s*/, '').trim();
    case 'list':
      return text.replace(/^\s*(?:[-*+]\s*|\d+\.\s+)/, '').trim();
    case 'blockquote':
      return text.replace(/^>\s*/, '').trim();
    default:
      return text.trim();
  }
}
247
+
248
// Quad emission utilities: IRIs from the RDF syntax namespace.
const RDF_SYNTAX_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';

export const RDF_TYPE = `${RDF_SYNTAX_NS}type`;
export const RDF_STATEMENT = `${RDF_SYNTAX_NS}Statement`;
export const RDF_SUBJECT = `${RDF_SYNTAX_NS}subject`;
export const RDF_PREDICATE = `${RDF_SYNTAX_NS}predicate`;
export const RDF_OBJECT = `${RDF_SYNTAX_NS}object`;
254
+
255
/**
 * Build a compact origin record linking a subject/predicate pair to its block.
 *
 * @param {object} block - Block entry; `id`, `range`, `carrierType`, `context` and `text` are read.
 * @param {{value: string}} subject - Subject term.
 * @param {{value: string}} predicate - Predicate term.
 * @param {{remove?: boolean}|null} [meta=null] - `remove` selects polarity `'-'` instead of `'+'`.
 * @returns {object} Origin entry. `context` is the block's own object (not a copy);
 *   `value` falls back to `''` when the block has no text.
 */
export function createLeanOriginEntry(block, subject, predicate, meta = null) {
  const { id: blockId, range, carrierType, context, text } = block;
  const polarity = meta?.remove ? '-' : '+';

  return {
    blockId,
    range,
    carrierType,
    subject: subject.value,
    predicate: predicate.value,
    context,
    polarity,
    value: text || ''
  };
}
267
+
268
// Fragment resolution utilities

/**
 * Resolve a fragment name against the current subject's IRI.
 *
 * Anything from the first `#` onwards in the subject IRI is replaced by
 * `#` + `fragment`.
 *
 * @param {string} fragment - Fragment name without the leading `#`.
 * @param {{value: string}|null} currentSubject - Subject providing the base IRI.
 * @param {{namedNode: Function}} dataFactory - Factory used to create the term.
 * @returns {*} Result of `dataFactory.namedNode`, or `null` without a current subject.
 */
export function resolveFragment(fragment, currentSubject, dataFactory) {
  if (!currentSubject) return null;
  const [baseIRI] = currentSubject.value.split('#', 1);
  return dataFactory.namedNode(`${baseIRI}#${fragment}`);
}
276
+
277
/**
 * Resolve the subject declared by a semantic block.
 *
 * @param {{subject?: string}} sem - Parsed semantic block.
 * @param {object} state - Parser state; `currentSubject`, `df` and `ctx` are used.
 * @returns {*} `null` when no subject is declared or when it is `'RESET'`
 *   (which also clears `state.currentSubject`); a fragment resolved against the
 *   current subject for `=#name`; otherwise a named node for the expanded IRI.
 */
export function resolveSubject(sem, state) {
  const { subject } = sem;
  if (!subject) return null;

  if (subject === 'RESET') {
    state.currentSubject = null;
    return null;
  }

  return subject.startsWith('=#')
    ? resolveFragment(subject.substring(2), state.currentSubject, state.df)
    : state.df.namedNode(expandIRI(subject, state.ctx));
}
286
+
287
/**
 * Resolve the object IRI declared by a semantic block.
 *
 * @param {{object?: string}} sem - Parsed semantic block.
 * @param {object} state - Parser state; `currentSubject`, `df` and `ctx` are used.
 * @returns {*} `null` when no object is declared; a fragment resolved against
 *   the current subject for `#name`; otherwise a named node for the expanded IRI.
 */
export function resolveObject(sem, state) {
  const { object } = sem;
  if (!object) return null;

  return object.startsWith('#')
    ? resolveFragment(object.substring(1), state.currentSubject, state.df)
    : state.df.namedNode(expandIRI(object, state.ctx));
}
292
+
159
293
  // HTML escaping - shared utility
160
294
  export function escapeHtml(text) {
161
295
  if (!text) return '';
@@ -167,6 +301,77 @@ export function escapeHtml(text) {
167
301
  .replace(/'/g, '&#x27;');
168
302
  }
169
303
 
304
// RDF term type checking utilities - shared across modules

/**
 * @param {*} term - Candidate RDF term; may be null or undefined.
 * @returns {boolean} True when `term.termType` is `'Literal'`.
 */
export function isLiteral(term) {
  return term != null && term.termType === 'Literal';
}
308
+
309
/**
 * @param {*} term - Candidate RDF term; may be null or undefined.
 * @returns {boolean} True when `term.termType` is `'NamedNode'`.
 */
export function isNamedNode(term) {
  return term != null && term.termType === 'NamedNode';
}
312
+
313
/**
 * @param {*} term - Candidate RDF term; may be null or undefined.
 * @returns {boolean} True when the term's value is the rdf:type IRI.
 */
export function isRdfType(term) {
  // Reuse the module's RDF_TYPE constant instead of repeating the literal.
  return term?.value === RDF_TYPE;
}
316
+
317
/**
 * IRI prefix extraction utility.
 *
 * Returns the prefix that `shortenIRI` assigned to an IRI. If shortening left
 * the IRI unchanged, no prefix applies. Without this check the URL scheme
 * (e.g. `http`) would be reported as a prefix.
 * NOTE(review): assumes `shortenIRI` returns its input unchanged when no
 * prefix matches - confirm against utils.js.
 *
 * @param {string} iri - Full IRI.
 * @param {object} context - Prefix map passed through to `shortenIRI`.
 * @returns {string|null} Prefix name, or `null` when none applies.
 */
export function getPrefixFromIRI(iri, context) {
  if (!iri) return null;

  const shortened = shortenIRI(iri, context);
  if (shortened === iri) return null;

  const colonIndex = shortened.indexOf(':');
  return colonIndex === -1 ? null : shortened.slice(0, colonIndex);
}
326
+
327
/**
 * Prefix collection utility - used by generate.js.
 *
 * Gathers the prefix of every subject, predicate, named-node object and
 * literal datatype IRI found in the grouped quads.
 *
 * @param {Map<*, Array<object>>} subjectGroups - Quads grouped per subject.
 * @param {object} context - Prefix map passed to `getPrefixFromIRI`.
 * @returns {Set<string>} Prefix names in use.
 */
export function collectUsedPrefixes(subjectGroups, context) {
  const usedPrefixes = new Set();

  for (const subjectQuads of subjectGroups.values()) {
    for (const { subject, predicate, object } of subjectQuads) {
      const iris = [subject.value, predicate.value];

      // Object IRIs count only for named nodes; datatype IRIs only when present.
      if (isNamedNode(object)) iris.push(object.value);
      if (object.datatype && object.datatype.value) iris.push(object.datatype.value);

      for (const iri of iris) {
        const prefix = getPrefixFromIRI(iri, context);
        if (prefix) usedPrefixes.add(prefix);
      }
    }
  }

  return usedPrefixes;
}
357
+
358
/**
 * Token processing utility - eliminates duplication in TOKEN_PROCESSORS.
 *
 * Creates a block entry for the token, makes it the current block while the
 * processors run, then restores the previous block. Restoration happens in a
 * `finally` so a throwing processor cannot leave `state.blockStack` and
 * `state.currentBlock` pointing at a block that is no longer being processed.
 *
 * @param {object} token - Token being processed; `token.type` is forwarded.
 * @param {object} state - Parser state; uses `blockStack`, `currentBlock`, `origin.blocks`.
 * @param {Function} processAnnotations - Called as `(token, state, token.type)`.
 * @param {Function} createBlockEntry - Called as `(token, state)`; must return an object with `id`.
 * @param {Array<Function>} [additionalProcessors=[]] - Run before annotations, each as `(token, state)`.
 * @returns {void}
 */
export function processTokenWithBlockTracking(token, state, processAnnotations, createBlockEntry, additionalProcessors = []) {
  const blockEntry = createBlockEntry(token, state);
  state.currentBlock = blockEntry;
  state.blockStack.push(blockEntry.id);

  try {
    // Run any additional processors first
    additionalProcessors.forEach(processor => processor(token, state));

    // Process annotations
    processAnnotations(token, state, token.type);
  } finally {
    state.blockStack.pop();
    const depth = state.blockStack.length;
    state.currentBlock = depth > 0
      ? state.origin.blocks.get(state.blockStack[depth - 1])
      : null;
  }
}
374
+
170
375
  // Quad key generation - shared between parser and renderer
171
376
  export function quadIndexKey(subject, predicate, object) {
172
377
  const datatype = object.datatype?.value || '';
@@ -199,14 +404,126 @@ export function resolveSubjectType(subjectDecl) {
199
404
  return 'full-iri';
200
405
  }
201
406
 
202
- // Fragment resolution - shared logic
203
- export function resolveFragment(fragment, currentSubject) {
204
- if (!currentSubject) {
205
- throw new Error('Fragment requires current subject');
407
+ // XSD string datatype IRI, exported for shared use; generateLiteralText in this file emits no ^^datatype annotation for literals typed with it
408
+ export const XSD_STRING = 'http://www.w3.org/2001/XMLSchema#string';
409
+
410
/**
 * Sort quads by predicate IRI using `localeCompare`.
 *
 * @param {Array<{predicate: {value: string}}>} quads - Sorted in place.
 * @returns {Array} The same array instance, now ordered.
 */
export function sortQuadsByPredicate(quads) {
  const byPredicate = (left, right) => left.predicate.value.localeCompare(right.predicate.value);
  quads.sort(byPredicate);
  return quads;
}
414
+
415
/**
 * Render one prefix declaration line.
 *
 * @param {string} prefix - Prefix name.
 * @param {string} namespace - Namespace IRI.
 * @returns {string} `[prefix] <namespace>` followed by a newline.
 */
export const generatePrefixDeclaration = (prefix, namespace) => {
  const declaration = `[${prefix}] <${namespace}>`;
  return `${declaration}\n`;
};
417
+
418
/**
 * Render a literal quad as an annotated span: `[value] {predicate ...}`.
 *
 * A language tag is written as ` @lang`. Otherwise a datatype other than
 * xsd:string is written as ` ^^datatype`. A literal without a `datatype` is
 * treated as a plain string instead of throwing, consistent with the guard
 * used by `collectUsedPrefixes`.
 *
 * @param {object} quad - Quad whose object is a literal.
 * @param {object} context - Prefix map passed to `shortenIRI`.
 * @returns {string} One line of text ending in a newline.
 */
export function generateLiteralText(quad, context) {
  const { predicate, object } = quad;
  const datatypeIRI = object.datatype?.value;
  let annotation = shortenIRI(predicate.value, context);

  if (object.language) {
    annotation += ` @${object.language}`;
  } else if (datatypeIRI && datatypeIRI !== XSD_STRING) {
    annotation += ` ^^${shortenIRI(datatypeIRI, context)}`;
  }

  return `[${object.value}] {${annotation}}\n`;
}
430
+
431
/**
 * Render an object-property quad as `[obj] {+obj ?pred}`.
 *
 * @param {object} quad - Quad whose object and predicate IRIs are shortened.
 * @param {object} context - Prefix map passed to `shortenIRI`.
 * @returns {string} One line of text ending in a newline.
 */
export const generateObjectText = (quad, context) => {
  const [objShort, predShort] = [quad.object.value, quad.predicate.value]
    .map(iri => shortenIRI(iri, context));
  const annotation = `+${objShort} ?${predShort}`;
  return `[${objShort}] {${annotation}}\n`;
};
436
+
437
/**
 * Partition a subject's quads into rdf:type statements, literal-valued
 * statements and named-node-valued statements.
 *
 * The rdf:type check wins over the object checks. A quad whose object is
 * neither a literal nor a named node lands in no bucket.
 *
 * @param {Iterable<object>} subjectQuads - Quads for one subject.
 * @returns {{types: object[], literals: object[], objects: object[]}}
 */
export function filterQuadsByType(subjectQuads) {
  const buckets = { types: [], literals: [], objects: [] };

  for (const quad of subjectQuads) {
    if (isRdfType(quad.predicate)) {
      buckets.types.push(quad);
      continue;
    }
    if (isLiteral(quad.object)) {
      buckets.literals.push(quad);
      continue;
    }
    if (isNamedNode(quad.object)) {
      buckets.objects.push(quad);
    }
  }

  return buckets;
}
451
+
452
/**
 * Predicate processing utility: bucket predicates by annotation form.
 *
 * Each predicate is either a string or an object with `iri` and optional
 * `form`. The IRI is expanded and then shortened against `ctx`. Form `'!'`
 * goes to `reverseProps`, `'?'` to `objectProps`, and anything else
 * (including plain strings) to `literalProps`.
 *
 * @param {Array<string|{iri: string, form?: string}>} predicates
 * @param {object} ctx - Prefix map for `expandIRI` / `shortenIRI`.
 * @returns {{literalProps: string[], objectProps: string[], reverseProps: string[]}}
 */
export function processPredicates(predicates, ctx) {
  const literalProps = [];
  const objectProps = [];
  const reverseProps = [];
  const bucketByForm = new Map([
    ['!', reverseProps],
    ['?', objectProps]
  ]);

  for (const pred of predicates) {
    const isPlain = typeof pred === 'string';
    const iri = isPlain ? pred : pred.iri;
    const shortened = shortenIRI(expandIRI(iri, ctx), ctx);
    const form = isPlain ? '' : (pred.form || '');

    (bucketByForm.get(form) ?? literalProps).push(shortened);
  }

  return { literalProps, objectProps, reverseProps };
}
475
+
476
/**
 * Sort an array in place by a string key, compared with `localeCompare`.
 *
 * @param {Array} array - Sorted in place.
 * @param {Function} keyFn - Maps an element to its string sort key.
 * @returns {Array} The same array instance.
 */
export function sortDeterministic(array, keyFn) {
  const compareKeys = (left, right) => keyFn(left).localeCompare(keyFn(right));
  return array.sort(compareKeys);
}
484
+
485
/**
 * Sort quads in place: subject, then predicate, then object.
 *
 * Objects with the same lexical value are further ordered by term type,
 * datatype IRI and language tag. Without these tie-breakers, two objects such
 * as "1"^^xsd:integer and "1"^^xsd:string kept their input order, so the
 * result depended on how the input happened to be arranged.
 *
 * @param {Array<object>} quads - Quads with `subject`, `predicate`, `object` terms.
 * @returns {Array<object>} The same array instance, now ordered.
 */
export function sortQuadsDeterministically(quads) {
  const text = value => value || '';

  return quads.sort((a, b) =>
    a.subject.value.localeCompare(b.subject.value)
    || a.predicate.value.localeCompare(b.predicate.value)
    || a.object.value.localeCompare(b.object.value)
    || text(a.object.termType).localeCompare(text(b.object.termType))
    || text(a.object.datatype?.value).localeCompare(text(b.object.datatype?.value))
    || text(a.object.language).localeCompare(text(b.object.language))
  );
}
497
+
498
/**
 * Emit prefix declarations, sorted by prefix name, for prefixes in use.
 *
 * Skipped entries: `@`-keywords (such as `@vocab`), prefixes absent from
 * `usedPrefixes`, and prefixes whose namespace is identical to the one in
 * DEFAULT_CONTEXT. A default prefix remapped to a different namespace is
 * emitted, because omitting it would make the output resolve to the default
 * namespace. DEFAULT_CONTEXT is checked by own property, so inherited keys
 * such as `constructor` are not mistaken for defaults.
 *
 * @param {object} context - Prefix map (`prefix -> namespace IRI`).
 * @param {Set<string>} usedPrefixes - Prefixes referenced by the output.
 * @returns {string} Concatenated declaration lines; empty when none apply.
 */
export function generateDeterministicPrefixes(context, usedPrefixes) {
  const isBuiltInMapping = (prefix, namespace) =>
    Object.prototype.hasOwnProperty.call(DEFAULT_CONTEXT, prefix)
    && DEFAULT_CONTEXT[prefix] === namespace;

  const sortedEntries = Object.entries(context).sort(([a], [b]) => a.localeCompare(b));
  let text = '';

  for (const [prefix, namespace] of sortedEntries) {
    if (prefix.startsWith('@')) continue;
    if (!usedPrefixes.has(prefix)) continue;
    if (isBuiltInMapping(prefix, namespace)) continue;
    text += generatePrefixDeclaration(prefix, namespace);
  }

  return text;
}
511
+
512
/**
 * Create a block entry for a token and register it in `state.origin.blocks`.
 *
 * The id is `hash()` of the token's range bounds plus the first 50 characters
 * of its text. `context` is a shallow copy of `state.ctx`.
 * NOTE(review): `hash` is not defined or imported in the visible part of this
 * module - confirm it is in scope.
 *
 * @param {{type: string, range: number[], text: string}} token
 * @param {object} state - Parser state; reads `currentSubject` and `ctx`, writes `origin.blocks`.
 * @returns {object} The newly registered block entry.
 */
export function createOptimizedBlockEntry(token, state) {
  const { type, range, text } = token;
  const id = hash(`${range[0]}-${range[1]}-${text.slice(0, 50)}`);

  const block = {
    id,
    type,
    carrierType: type,
    range,
    text,
    carriers: [],
    predicates: [],
    subject: state.currentSubject,
    context: { ...state.ctx }
  };

  state.origin.blocks.set(id, block);
  return block;
}
package/src/utils.js CHANGED
@@ -1,4 +1,4 @@
1
- import { URL_REGEX, DEFAULT_CONTEXT } from './shared.js';
1
+ import { URL_REGEX, DEFAULT_CONTEXT } from './constants.js';
2
2
 
3
3
  // Base Term class for RDF/JS compatibility
4
4
  export class Term {