mdld-parse 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mdld-parse",
3
- "version": "0.7.3",
3
+ "version": "0.7.5",
4
4
  "description": "A standards-compliant parser for **MD-LD (Markdown-Linked Data)** — a human-friendly RDF authoring format that extends Markdown with semantic annotations.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -38,8 +38,5 @@
38
38
  "homepage": "https://mdld.js.org",
39
39
  "bugs": {
40
40
  "url": "https://github.com/davay42/mdld-parse/issues"
41
- },
42
- "dependencies": {
43
- "rdfa-parse": "^1.0.1"
44
41
  }
45
42
  }
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Shared utilities for MD-LD Parser and Renderer
3
+ * Ensures DRY code and consistent CommonMark processing
4
+ */
5
+
6
+
7
+
8
+ export const DEFAULT_CONTEXT = {
9
+ '@vocab': "http://www.w3.org/2000/01/rdf-schema#",
10
+ rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
11
+ rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
12
+ xsd: 'http://www.w3.org/2001/XMLSchema#',
13
+ sh: "http://www.w3.org/ns/shacl#",
14
+ prov: 'http://www.w3.org/ns/prov#'
15
+ };
16
+
17
+ // CommonMark patterns - shared between parser and renderer
18
+ export const URL_REGEX = /^(https?|ftp|mailto|tag|nih|urn|uuid|did|web|ipfs|ipns|data|file|urn:uuid):/;
19
+ export const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
20
+ export const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
21
+ export const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
22
+ export const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
23
+ export const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
24
+ export const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
25
+
26
+ // Pre-compiled carrier patterns for performance
27
+ export const CARRIER_PATTERN_ARRAY = [
28
+ ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
29
+ ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
30
+ ];
package/src/generate.js CHANGED
@@ -1,21 +1,26 @@
1
1
  import { shortenIRI, expandIRI, DataFactory } from './utils.js';
2
- import { DEFAULT_CONTEXT } from './shared.js';
3
-
4
- // Helper functions for cleaner term type checking
5
- function isLiteral(term) {
6
- return term?.termType === 'Literal';
7
- }
8
-
9
- function isNamedNode(term) {
10
- return term?.termType === 'NamedNode';
11
- }
12
-
13
- function isRdfType(term) {
14
- return term?.value === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
15
- }
16
-
2
+ import { DEFAULT_CONTEXT } from './constants.js';
3
+ import {
4
+ isLiteral,
5
+ collectUsedPrefixes,
6
+ sortQuadsByPredicate,
7
+ generatePrefixDeclaration,
8
+ generateLiteralText,
9
+ generateObjectText,
10
+ filterQuadsByType
11
+ } from './shared.js';
12
+
13
+ export function extractLocalName(iri, ctx = {}) {
14
+ if (!iri) return iri;
15
+
16
+ // Check for exact prefix matches first
17
+ for (const [prefix, namespace] of Object.entries(ctx)) {
18
+ if (iri.startsWith(namespace) || iri.startsWith(namespace.slice(0, -1))) {
19
+ return iri.substring(namespace.length);
20
+ }
21
+ }
17
22
 
18
- function extractLocalName(iri) {
23
+ // Fallback to original logic for local names
19
24
  const separators = ['#', '/', ':'];
20
25
  for (const sep of separators) {
21
26
  const lastSep = iri.lastIndexOf(sep);
@@ -83,14 +88,14 @@ function groupQuadsBySubject(quads) {
83
88
 
84
89
  function buildDeterministicMDLD(subjectGroups, context) {
85
90
  let text = '';
91
+ const usedPrefixes = collectUsedPrefixes(subjectGroups, context);
86
92
 
87
93
  // Add prefixes first (deterministic order), but exclude default context prefixes
88
94
  const sortedPrefixes = Object.entries(context).sort(([a], [b]) => a.localeCompare(b));
89
95
  for (const [prefix, namespace] of sortedPrefixes) {
90
96
  // Skip default context prefixes - they're implicit in MDLD
91
- if (prefix !== '@vocab' && !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix]) {
92
- const prefixDecl = `[${prefix}] <${namespace}>\n`;
93
- text += prefixDecl;
97
+ if (prefix !== '@vocab' && !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix] && usedPrefixes.has(prefix)) {
98
+ text += generatePrefixDeclaration(prefix, namespace);
94
99
  }
95
100
  }
96
101
 
@@ -105,47 +110,25 @@ function buildDeterministicMDLD(subjectGroups, context) {
105
110
  const subjectQuads = subjectGroups.get(subjectIRI);
106
111
  const shortSubject = shortenIRI(subjectIRI, context);
107
112
 
108
- // Separate types, literals, and objects using helper functions
109
- const types = subjectQuads.filter(q => isRdfType(q.predicate));
110
- const literals = subjectQuads.filter(q => isLiteral(q.object) && !isRdfType(q.predicate));
111
- const objects = subjectQuads.filter(q => isNamedNode(q.object) && !isRdfType(q.predicate));
113
+ // Separate types, literals, and objects using shared utility
114
+ const { types, literals, objects } = filterQuadsByType(subjectQuads);
112
115
 
113
116
  // Generate heading
114
- const localSubjectName = extractLocalName(subjectIRI);
117
+ const localSubjectName = extractLocalName(subjectIRI, context);
115
118
  const typeAnnotations = types.length > 0
116
- ? ' ' + types.map(t => '.' + extractLocalName(t.object.value)).sort().join(' ')
119
+ ? ' ' + types.map(t => '.' + shortenIRI(t.object.value, context)).sort().join(' ')
117
120
  : '';
118
121
 
119
- const headingText = `# ${localSubjectName} {=${shortSubject}${typeAnnotations}}\n\n`;
120
-
121
- text += headingText;
122
-
123
- // Add literals (deterministic order)
124
- const sortedLiterals = literals.sort((a, b) => a.predicate.value.localeCompare(b.predicate.value));
125
- for (const quad of sortedLiterals) {
126
- const predShort = shortenIRI(quad.predicate.value, context);
127
- let annotation = predShort;
128
-
129
- // Use DataFactory XSD constants for datatype comparison
130
- const xsdString = 'http://www.w3.org/2001/XMLSchema#string';
131
- if (quad.object.language) {
132
- annotation += ` @${quad.object.language}`;
133
- } else if (quad.object.datatype.value !== xsdString) {
134
- annotation += ` ^^${shortenIRI(quad.object.datatype.value, context)}`;
135
- }
122
+ text += `# ${localSubjectName} {=${shortSubject}${typeAnnotations}}\n`;
136
123
 
137
- const literalText = `[${quad.object.value}] {${annotation}}\n`;
138
- text += literalText;
139
- }
124
+ // Add literals and objects using shared utilities
125
+ sortQuadsByPredicate(literals).forEach(quad => {
126
+ text += generateLiteralText(quad, context);
127
+ });
140
128
 
141
- // Add objects (deterministic order)
142
- const sortedObjects = objects.sort((a, b) => a.predicate.value.localeCompare(b.predicate.value));
143
- for (const quad of sortedObjects) {
144
- const objShort = shortenIRI(quad.object.value, context);
145
- const predShort = shortenIRI(quad.predicate.value, context);
146
- const objectText = `[${objShort}] {+${objShort} ?${predShort}}\n`;
147
- text += objectText;
148
- }
129
+ sortQuadsByPredicate(objects).forEach(quad => {
130
+ text += generateObjectText(quad, context);
131
+ });
149
132
 
150
133
  text += '\n';
151
134
  }
package/src/index.js CHANGED
@@ -3,7 +3,7 @@ export { merge } from './merge.js';
3
3
  export { generate } from './generate.js';
4
4
  export { locate } from './locate.js';
5
5
  export { render } from './render.js';
6
- export { DEFAULT_CONTEXT } from './shared.js';
6
+ export { DEFAULT_CONTEXT } from './constants.js';
7
7
  export {
8
8
  DataFactory,
9
9
  hash,
package/src/locate.js CHANGED
@@ -18,21 +18,6 @@ export function locate(quad, origin) {
18
18
  return null;
19
19
  }
20
20
 
21
- // Find the origin entry in quadIndex
22
- const entry = origin.quadIndex.get(quadKey);
23
- if (!entry) {
24
- return null;
25
- }
26
-
27
- // Return the lean origin entry structure
28
- return {
29
- blockId: entry.blockId,
30
- range: entry.range,
31
- carrierType: entry.carrierType,
32
- subject: entry.subject,
33
- predicate: entry.predicate,
34
- context: entry.context,
35
- value: entry.value,
36
- polarity: entry.polarity
37
- };
21
+ // Return the origin entry directly - no need to create new object
22
+ return origin.quadIndex.get(quadKey) || null;
38
23
  }
package/src/merge.js CHANGED
@@ -1,15 +1,14 @@
1
1
  import { parse } from './parse.js';
2
- import { DEFAULT_CONTEXT } from './shared.js';
2
+ import { quadToKeyForOrigin } from './utils.js';
3
+ import { DEFAULT_CONTEXT } from './constants.js';
3
4
 
4
5
  /**
5
- * Creates a unique key for quad identity matching
6
+ * Creates a unique key for quad identity matching - using shared utility
6
7
  * @param {Quad} quad
7
8
  * @returns {string}
8
9
  */
9
10
  function quadKey(quad) {
10
- const datatype = quad.object.datatype?.value || '';
11
- const language = quad.object.language || '';
12
- return `${quad.subject.value}|${quad.predicate.value}|${quad.object.value}|${datatype}|${language}`;
11
+ return quadToKeyForOrigin(quad);
13
12
  }
14
13
 
15
14
  /**
@@ -42,6 +41,7 @@ export function merge(docs, options = {}) {
42
41
  const allDocuments = [];
43
42
  const quadIndex = new Map();
44
43
  const allStatements = []; // Collect statements from all documents
44
+ const accumulatedContext = new Map(); // Track all unique prefixes across documents
45
45
 
46
46
  // Process each document in order
47
47
  for (let i = 0; i < docs.length; i++) {
@@ -53,6 +53,16 @@ export function merge(docs, options = {}) {
53
53
  // Normalize input to ParseResult
54
54
  const doc = normalizeInput(input, options, docContext);
55
55
 
56
+ // Accumulate context from this document
57
+ if (doc.context) {
58
+ for (const [prefix, namespace] of Object.entries(doc.context)) {
59
+ // Don't override default context entries unless explicitly provided in options
60
+ if (!accumulatedContext.has(prefix) && !DEFAULT_CONTEXT[prefix]) {
61
+ accumulatedContext.set(prefix, namespace);
62
+ }
63
+ }
64
+ }
65
+
56
66
  // Create document origin
57
67
  const documentOrigin = {
58
68
  index: i,
@@ -74,14 +84,12 @@ export function merge(docs, options = {}) {
74
84
  sessionBuffer.set(key, quad);
75
85
 
76
86
  // Create quad origin with document index and polarity
77
- const existingOrigin = doc.origin.quadIndex.get(quadKey(quad));
78
- if (existingOrigin) {
79
- quadIndex.set(quadKey(quad), {
80
- ...existingOrigin,
81
- documentIndex: i,
82
- polarity: '+'
83
- });
84
- }
87
+ const existingOrigin = doc.origin.quadIndex.get(key);
88
+ quadIndex.set(key, {
89
+ ...(existingOrigin || {}),
90
+ documentIndex: i,
91
+ polarity: '+'
92
+ });
85
93
  }
86
94
 
87
95
  // Fold retractions
@@ -97,14 +105,12 @@ export function merge(docs, options = {}) {
97
105
  }
98
106
 
99
107
  // Create quad origin for remove quads
100
- const existingOrigin = doc.origin.quadIndex.get(quadKey(quad));
101
- if (existingOrigin) {
102
- quadIndex.set(quadKey(quad), {
103
- ...existingOrigin,
104
- documentIndex: i,
105
- polarity: '-'
106
- });
107
- }
108
+ const existingOrigin = doc.origin.quadIndex.get(key);
109
+ quadIndex.set(key, {
110
+ ...(existingOrigin || {}),
111
+ documentIndex: i,
112
+ polarity: '-'
113
+ });
108
114
  }
109
115
  }
110
116
 
@@ -119,7 +125,11 @@ export function merge(docs, options = {}) {
119
125
  };
120
126
 
121
127
  // Build final context (union of all contexts)
122
- const finalContext = { ...DEFAULT_CONTEXT, ...options.context };
128
+ const finalContext = {
129
+ ...DEFAULT_CONTEXT,
130
+ ...options.context,
131
+ ...Object.fromEntries(accumulatedContext)
132
+ };
123
133
 
124
134
  // Enforce hard invariant
125
135
  const quadKeys = new Set(finalQuads.map(quadKey));
package/src/parse.js CHANGED
@@ -1,93 +1,128 @@
1
1
  import {
2
2
  DataFactory,
3
3
  expandIRI,
4
- parseSemanticBlock,
5
4
  quadIndexKey,
6
5
  createLiteral,
7
6
  hash
8
7
  } from './utils.js';
9
- import { DEFAULT_CONTEXT, URL_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX, STANDALONE_SUBJECT_REGEX, INLINE_CARRIER_PATTERNS } from './shared.js';
8
+ import {
9
+ DEFAULT_CONTEXT,
10
+ URL_REGEX,
11
+ FENCE_REGEX,
12
+ PREFIX_REGEX,
13
+ HEADING_REGEX,
14
+ UNORDERED_LIST_REGEX,
15
+ BLOCKQUOTE_REGEX,
16
+ STANDALONE_SUBJECT_REGEX,
17
+ CARRIER_PATTERN_ARRAY,
18
+
19
+ } from './constants.js';
20
+ import {
10
21
 
11
- // Cache for fence regex patterns to avoid recreation
12
- const FENCE_CLOSE_PATTERNS = new Map();
22
+ getFenceClosePattern,
23
+ calcRangeInfo,
24
+ calcAttrsRange,
25
+ createToken,
26
+ createCarrier,
27
+ createListToken,
28
+ parseSemCached,
29
+ parseLangAndAttrs,
30
+ findMatchingBracket,
31
+ extractUrlFromBrackets,
32
+ extractAttributesFromText,
33
+ determineCarrierType,
34
+ calcCarrierRanges,
35
+ extractCleanText,
36
+ RDF_TYPE,
37
+ RDF_STATEMENT,
38
+ RDF_SUBJECT,
39
+ RDF_PREDICATE,
40
+ RDF_OBJECT,
41
+ createLeanOriginEntry,
42
+ resolveSubject,
43
+ resolveObject,
44
+ processTokenWithBlockTracking
45
+ } from './shared.js';
13
46
 
14
- function getFenceClosePattern(fenceChar) {
15
- if (!FENCE_CLOSE_PATTERNS.has(fenceChar)) {
16
- FENCE_CLOSE_PATTERNS.set(fenceChar, new RegExp(`^(${fenceChar}{3,})`));
17
- }
18
- return FENCE_CLOSE_PATTERNS.get(fenceChar);
19
- }
20
47
 
21
- function parseLangAndAttrs(langAndAttrs) {
22
- const spaceIndex = langAndAttrs.indexOf(' ');
23
- const braceIndex = langAndAttrs.indexOf('{');
24
- const langEnd = Math.min(
25
- spaceIndex > -1 ? spaceIndex : Infinity,
26
- braceIndex > -1 ? braceIndex : Infinity
27
- );
28
- return {
29
- lang: langAndAttrs.substring(0, langEnd),
30
- attrsText: langAndAttrs.substring(langEnd).match(/\{[^{}]*\}/)?.[0] || null
48
+ export function parse(text, options = {}) {
49
+ const state = {
50
+ ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
51
+ df: options.dataFactory || DataFactory,
52
+ quads: [],
53
+ quadBuffer: new Map(),
54
+ removeSet: new Set(),
55
+ origin: {
56
+ quadIndex: new Map(),
57
+ blocks: new Map(),
58
+ documentStructure: []
59
+ },
60
+ currentSubject: null,
61
+ tokens: null,
62
+ currentTokenIndex: -1,
63
+ statements: [],
64
+ statementCandidates: new Map(),
65
+ currentBlock: null,
66
+ blockStack: []
31
67
  };
32
- }
33
68
 
34
- const semCache = {};
35
- const EMPTY_SEM = Object.freeze({ predicates: [], types: [], subject: null });
69
+ state.tokens = scanTokens(text);
70
+
71
+ // Single-pass processing: resolve prefixes AND process tokens together
72
+ for (let i = 0; i < state.tokens.length; i++) {
73
+ const token = state.tokens[i];
74
+ state.currentTokenIndex = i;
75
+
76
+ // Handle prefix tokens immediately during main pass
77
+ if (token.type === 'prefix') {
78
+ let resolvedIri = token.iri;
79
+ if (token.iri.includes(':')) {
80
+ const colonIndex = token.iri.indexOf(':');
81
+ const potentialPrefix = token.iri.substring(0, colonIndex);
82
+ const reference = token.iri.substring(colonIndex + 1);
83
+ if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
84
+ resolvedIri = state.ctx[potentialPrefix] + reference;
85
+ }
86
+ }
87
+ state.ctx[token.prefix] = resolvedIri;
88
+ continue; // Skip token processor for prefixes
89
+ }
36
90
 
37
- function parseSemCached(attrs) {
38
- if (!attrs) return EMPTY_SEM;
39
- let sem = semCache[attrs];
40
- if (!sem) {
41
- sem = Object.freeze(parseSemanticBlock(attrs));
42
- semCache[attrs] = sem;
91
+ // Process all other tokens
92
+ TOKEN_PROCESSORS[token.type]?.(token, state);
43
93
  }
44
- return sem;
45
- }
46
94
 
47
- function calcRangeInfo(line, attrs, lineStart, prefixLength, valueLength) {
48
- const wsLength = prefixLength < line.length && line[prefixLength] === ' ' ? 1 :
49
- line.slice(prefixLength).match(/^\s+/)?.[0]?.length || 0;
50
- const valueStartInLine = prefixLength + wsLength;
51
- return {
52
- valueRange: [lineStart + valueStartInLine, lineStart + valueStartInLine + valueLength],
53
- attrsRange: calcAttrsRange(line, attrs, lineStart)
54
- };
55
- }
95
+ // Optimized quad filtering - use Set.has() instead of array.includes()
96
+ const quadKeys = new Set();
97
+ for (const quad of state.quads) {
98
+ quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
99
+ }
56
100
 
57
- function calcAttrsRange(line, attrs, lineStart) {
58
- if (!attrs) return null;
59
- const attrsStartInLine = line.lastIndexOf(attrs);
60
- return attrsStartInLine >= 0 ? [lineStart + attrsStartInLine, lineStart + attrsStartInLine + attrs.length] : null;
61
- }
101
+ // Direct Set iteration - more efficient than filter()
102
+ const filteredRemove = [];
103
+ for (const quad of state.removeSet) {
104
+ const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
105
+ if (!quadKeys.has(key)) {
106
+ filteredRemove.push(quad);
107
+ }
108
+ }
62
109
 
63
- function createToken(type, range, text, attrs = null, attrsRange = null, valueRange = null, extra = {}) {
64
- const token = { type, range, text, attrs, attrsRange, valueRange, ...extra };
65
- Object.defineProperty(token, '_carriers', {
66
- enumerable: false, writable: true, value: null
67
- });
68
- return token;
110
+ return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
69
111
  }
70
112
 
113
+
114
+ // Cache for fence regex patterns - using shared utility
115
+
71
116
  function getCarriers(token) {
72
117
  if (token.type === 'code') return [];
73
118
  return token._carriers || (token._carriers = extractInlineCarriers(token.text, token.range[0]));
74
119
  }
75
120
 
76
- const createListToken = (type, line, lineStart, pos, match) => {
77
- const attrs = match[4] || null;
78
- const prefix = match[1].length + (match[2] ? match[2].length : 0);
79
- const rangeInfo = calcRangeInfo(line, attrs, lineStart, prefix, match[3].length);
80
- return createToken(type, [lineStart, pos - 1], match[3].trim(), attrs,
81
- rangeInfo.attrsRange, rangeInfo.valueRange, { indent: match[1].length });
82
- };
83
-
84
121
  function scanTokens(text) {
85
122
  const tokens = [];
86
123
  const lines = text.split('\n');
87
124
  let pos = 0;
88
125
  let codeBlock = null;
89
-
90
- // Direct lookup instead of linear search
91
126
  const PROCESSORS = [
92
127
  { type: 'fence', test: line => FENCE_REGEX.test(line.trim()), process: handleFence },
93
128
  { type: 'content', test: () => codeBlock, process: line => codeBlock.content.push(line) },
@@ -194,16 +229,6 @@ function scanTokens(text) {
194
229
  return tokens;
195
230
  }
196
231
 
197
- function createCarrier(type, text, attrs, attrsRange, valueRange, range, pos, extra = {}) {
198
- return { type, text, attrs, attrsRange, valueRange, range, pos, ...extra };
199
- }
200
-
201
- // Pre-compiled carrier patterns for better performance
202
- const CARRIER_PATTERN_ARRAY = [
203
- ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
204
- ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
205
- ];
206
-
207
232
  function extractInlineCarriers(text, baseOffset = 0) {
208
233
  const carriers = [];
209
234
  let pos = 0;
@@ -268,74 +293,6 @@ function extractInlineCarriers(text, baseOffset = 0) {
268
293
  return carriers;
269
294
  }
270
295
 
271
- function calcCarrierRanges(match, baseOffset, matchStart) {
272
- const valueStart = baseOffset + matchStart + match[0].indexOf(match[1]);
273
- const valueEnd = valueStart + match[1].length;
274
- const attrsStart = baseOffset + matchStart + match[0].indexOf('{');
275
- const attrsEnd = attrsStart + match[2].length + 2; // +2 for { and }
276
- return {
277
- valueRange: [valueStart, valueEnd],
278
- attrsRange: [attrsStart + 1, attrsEnd - 1], // Exclude braces
279
- range: [baseOffset + matchStart, attrsEnd],
280
- pos: matchStart + match[0].length // pos should be relative to current text, not document
281
- };
282
- }
283
-
284
- function findMatchingBracket(text, bracketStart) {
285
- let bracketDepth = 1;
286
- let bracketEnd = bracketStart + 1;
287
-
288
- while (bracketEnd < text.length && bracketDepth > 0) {
289
- if (text[bracketEnd] === '[') bracketDepth++;
290
- else if (text[bracketEnd] === ']') bracketDepth--;
291
- bracketEnd++;
292
- }
293
-
294
- return bracketDepth > 0 ? null : bracketEnd;
295
- }
296
-
297
- function extractUrlFromBrackets(text, bracketEnd) {
298
- let url = null;
299
- let spanEnd = bracketEnd;
300
-
301
- if (text[spanEnd] === '(') {
302
- const parenEnd = text.indexOf(')', spanEnd);
303
- if (parenEnd !== -1) {
304
- url = text.substring(spanEnd + 1, parenEnd);
305
- spanEnd = parenEnd + 1;
306
- }
307
- }
308
-
309
- return { url, spanEnd };
310
- }
311
-
312
- function extractAttributesFromText(text, spanEnd, baseOffset) {
313
- let attrs = null;
314
- let attrsRange = null;
315
- const remaining = text.substring(spanEnd);
316
-
317
- const wsMatch = remaining.match(/^\s+/);
318
- const attrsStart = wsMatch ? wsMatch[0].length : 0;
319
-
320
- if (remaining[attrsStart] === '{') {
321
- const braceEnd = remaining.indexOf('}', attrsStart);
322
- if (braceEnd !== -1) {
323
- attrs = remaining.substring(attrsStart, braceEnd + 1);
324
- const absStart = baseOffset + spanEnd + attrsStart;
325
- attrsRange = [absStart, absStart + attrs.length];
326
- spanEnd += braceEnd + 1;
327
- }
328
- }
329
-
330
- return { attrs, attrsRange, finalSpanEnd: spanEnd };
331
- }
332
-
333
- function determineCarrierType(url) {
334
- if (url && !url.startsWith('=')) {
335
- return { carrierType: 'link', resourceIRI: url };
336
- }
337
- return { carrierType: 'span', resourceIRI: null };
338
- }
339
296
 
340
297
  function createBlockEntry(token, state) {
341
298
  const blockId = token._blockId || hash(`${token.type}:${token.range?.[0]}:${token.range?.[1]}`);
@@ -364,31 +321,6 @@ function createBlockEntry(token, state) {
364
321
  return blockEntry;
365
322
  }
366
323
 
367
- function extractCleanText(token) {
368
- if (!token.text) return '';
369
-
370
- let text = token.text;
371
-
372
- // Remove semantic annotations
373
- if (token.attrsRange) {
374
- const beforeAttrs = text.substring(0, token.attrsRange[0] - (token.range?.[0] || 0));
375
- const afterAttrs = text.substring(token.attrsRange[1] - (token.range?.[0] || 0));
376
- text = beforeAttrs + afterAttrs;
377
- }
378
-
379
- // Clean based on token type
380
- switch (token.type) {
381
- case 'heading':
382
- return text.replace(/^#+\s*/, '').trim();
383
- case 'list':
384
- return text.replace(/^[-*+]\s*/, '').trim();
385
- case 'blockquote':
386
- return text.replace(/^>\s*/, '').trim();
387
- default:
388
- return text.trim();
389
- }
390
- }
391
-
392
324
  function enrichBlockFromAnnotation(blockEntry, sem, carrier, state) {
393
325
  // Update subject if available
394
326
  if (sem.subject && sem.subject !== 'RESET') {
@@ -539,17 +471,8 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
539
471
  // Detect rdf:Statement pattern during single-pass parsing
540
472
  detectStatementPatternSinglePass(quad, dataFactory, meta, statements, statementCandidates);
541
473
 
542
- // Create lean origin entry - avoid spread operator for better performance
543
- const originEntry = {
544
- blockId: block.id,
545
- range: block.range,
546
- carrierType: block.carrierType,
547
- subject: subject.value,
548
- predicate: predicate.value,
549
- context: block.context, // Direct reference instead of spread
550
- polarity: meta?.remove ? '-' : '+',
551
- value: block.text || ''
552
- };
474
+ // Create lean origin entry using shared utility
475
+ const originEntry = createLeanOriginEntry(block, subject, predicate, meta);
553
476
 
554
477
  quadIndex.set(quadKey, originEntry);
555
478
 
@@ -563,13 +486,6 @@ function emitQuad(quads, quadBuffer, removeSet, quadIndex, block, subject, predi
563
486
  }
564
487
  }
565
488
 
566
- // Extract RDF constants once at module level for efficiency
567
- const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
568
- const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';
569
- const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';
570
- const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';
571
- const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
572
-
573
489
  function detectStatementPatternSinglePass(quad, dataFactory, meta, statements = null, statementCandidates = null) {
574
490
  // Skip if not called from parse context (for testing compatibility)
575
491
  if (!statements || !statementCandidates) return;
@@ -619,30 +535,6 @@ function detectStatementPatternSinglePass(quad, dataFactory, meta, statements =
619
535
  }
620
536
  }
621
537
 
622
- const resolveFragment = (fragment, state) => {
623
- if (!state.currentSubject) return null;
624
- const subjectValue = state.currentSubject.value;
625
- const hashIndex = subjectValue.indexOf('#');
626
- const baseIRI = hashIndex > -1 ? subjectValue.slice(0, hashIndex) : subjectValue;
627
- return state.df.namedNode(baseIRI + '#' + fragment);
628
- };
629
-
630
- function resolveSubject(sem, state) {
631
- if (!sem.subject) return null;
632
- if (sem.subject === 'RESET') {
633
- state.currentSubject = null;
634
- return null;
635
- }
636
- if (sem.subject.startsWith('=#')) return resolveFragment(sem.subject.substring(2), state);
637
- return state.df.namedNode(expandIRI(sem.subject, state.ctx));
638
- }
639
-
640
- function resolveObject(sem, state) {
641
- if (!sem.object) return null;
642
- if (sem.object.startsWith('#')) return resolveFragment(sem.object.substring(1), state);
643
- return state.df.namedNode(expandIRI(sem.object, state.ctx));
644
- }
645
-
646
538
  const createTypeQuad = (typeIRI, subject, state, block, entryIndex = null) => {
647
539
  const expandedType = expandIRI(typeIRI, state.ctx);
648
540
  const typeInfo = typeof entryIndex === 'object' ? entryIndex : { entryIndex, remove: false };
@@ -748,123 +640,9 @@ function processStandaloneSubject(token, state) {
748
640
  }
749
641
 
750
642
  const TOKEN_PROCESSORS = {
751
- heading: (token, state) => {
752
- const blockEntry = createBlockEntry(token, state);
753
- state.currentBlock = blockEntry;
754
- state.blockStack.push(blockEntry.id);
755
-
756
- processTokenAnnotations(token, state, token.type);
757
-
758
- state.blockStack.pop();
759
- state.currentBlock = state.blockStack.length > 0 ?
760
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
761
- },
762
- code: (token, state) => {
763
- const blockEntry = createBlockEntry(token, state);
764
- state.currentBlock = blockEntry;
765
- state.blockStack.push(blockEntry.id);
766
-
767
- processTokenAnnotations(token, state, token.type);
768
-
769
- state.blockStack.pop();
770
- state.currentBlock = state.blockStack.length > 0 ?
771
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
772
- },
773
- blockquote: (token, state) => {
774
- const blockEntry = createBlockEntry(token, state);
775
- state.currentBlock = blockEntry;
776
- state.blockStack.push(blockEntry.id);
777
-
778
- processTokenAnnotations(token, state, token.type);
779
-
780
- state.blockStack.pop();
781
- state.currentBlock = state.blockStack.length > 0 ?
782
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
783
- },
784
- para: (token, state) => {
785
- const blockEntry = createBlockEntry(token, state);
786
- state.currentBlock = blockEntry;
787
- state.blockStack.push(blockEntry.id);
788
-
789
- processStandaloneSubject(token, state);
790
- processTokenAnnotations(token, state, token.type);
791
-
792
- state.blockStack.pop();
793
- state.currentBlock = state.blockStack.length > 0 ?
794
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
795
- },
796
- list: (token, state) => {
797
- const blockEntry = createBlockEntry(token, state);
798
- state.currentBlock = blockEntry;
799
- state.blockStack.push(blockEntry.id);
800
-
801
- processTokenAnnotations(token, state, token.type);
802
-
803
- state.blockStack.pop();
804
- state.currentBlock = state.blockStack.length > 0 ?
805
- state.origin.blocks.get(state.blockStack[state.blockStack.length - 1]) : null;
806
- },
643
+ heading: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
644
+ code: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
645
+ blockquote: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
646
+ para: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry, [processStandaloneSubject]),
647
+ list: (token, state) => processTokenWithBlockTracking(token, state, processTokenAnnotations, createBlockEntry),
807
648
  };
808
-
809
- export function parse(text, options = {}) {
810
- const state = {
811
- ctx: { ...DEFAULT_CONTEXT, ...(options.context || {}) },
812
- df: options.dataFactory || DataFactory,
813
- quads: [],
814
- quadBuffer: new Map(),
815
- removeSet: new Set(),
816
- origin: {
817
- quadIndex: new Map(),
818
- blocks: new Map(),
819
- documentStructure: []
820
- },
821
- currentSubject: null,
822
- tokens: null,
823
- currentTokenIndex: -1,
824
- statements: [],
825
- statementCandidates: new Map(), // Track incomplete rdf:Statement patterns
826
- currentBlock: null,
827
- blockStack: []
828
- };
829
-
830
- state.tokens = scanTokens(text);
831
-
832
- // Single loop instead of filter+forEach for better performance
833
- for (const token of state.tokens) {
834
- if (token.type === 'prefix') {
835
- let resolvedIri = token.iri;
836
- if (token.iri.includes(':')) {
837
- const colonIndex = token.iri.indexOf(':');
838
- const potentialPrefix = token.iri.substring(0, colonIndex);
839
- const reference = token.iri.substring(colonIndex + 1);
840
- if (state.ctx[potentialPrefix] && potentialPrefix !== '@vocab') {
841
- resolvedIri = state.ctx[potentialPrefix] + reference;
842
- }
843
- }
844
- state.ctx[token.prefix] = resolvedIri;
845
- }
846
- }
847
-
848
- for (let i = 0; i < state.tokens.length; i++) {
849
- const token = state.tokens[i];
850
- state.currentTokenIndex = i;
851
- TOKEN_PROCESSORS[token.type]?.(token, state);
852
- }
853
-
854
- // Optimize array operations - avoid Array.from() and filter()
855
- const quadKeys = new Set();
856
- for (const quad of state.quads) {
857
- quadKeys.add(quadIndexKey(quad.subject, quad.predicate, quad.object));
858
- }
859
-
860
- // Direct iteration instead of Array.from() + filter()
861
- const filteredRemove = [];
862
- for (const quad of state.removeSet) {
863
- const key = quadIndexKey(quad.subject, quad.predicate, quad.object);
864
- if (!quadKeys.has(key)) {
865
- filteredRemove.push(quad);
866
- }
867
- }
868
-
869
- return { quads: state.quads, remove: filteredRemove, statements: state.statements, origin: state.origin, context: state.ctx };
870
- }
package/src/render.js CHANGED
@@ -6,7 +6,12 @@ import {
6
6
  parseSemanticBlock,
7
7
  hash
8
8
  } from './utils.js';
9
- import { DEFAULT_CONTEXT } from './shared.js';
9
+ import {
10
+ escapeHtml,
11
+ getIndentLevel,
12
+ processPredicates
13
+ } from './shared.js';
14
+ import { DEFAULT_CONTEXT } from './constants.js';
10
15
 
11
16
  /**
12
17
  * Render MD-LD to HTML+RDFa
@@ -302,17 +307,6 @@ function parseMarkdownList(markdownList, blocks, state) {
302
307
  return html;
303
308
  }
304
309
 
305
- /**
306
- * Get indent level from source text
307
- */
308
- function getIndentLevel(block, sourceText) {
309
- if (!block.range || !sourceText) return 0;
310
-
311
- const text = sourceText.substring(block.range.start, block.range.end);
312
- const indentMatch = text.match(/^(\s*)/);
313
- return indentMatch ? indentMatch[1].length : 0;
314
- }
315
-
316
310
  /**
317
311
  * Render a single block
318
312
  */
@@ -408,26 +402,9 @@ function buildRDFaAttrsFromBlock(block, ctx) {
408
402
  attrs.push(`typeof="${escapeHtml(types)}"`);
409
403
  }
410
404
 
411
- // Predicates
405
+ // Predicates using shared utility
412
406
  if (block.predicates && block.predicates.length > 0) {
413
- const literalProps = [];
414
- const objectProps = [];
415
- const reverseProps = [];
416
-
417
- block.predicates.forEach(pred => {
418
- const iri = typeof pred === 'string' ? pred : pred.iri;
419
- const expanded = expandIRI(iri, ctx);
420
- const shortened = shortenIRI(expanded, ctx);
421
- const form = typeof pred === 'string' ? '' : (pred.form || '');
422
-
423
- if (form === '!') {
424
- reverseProps.push(shortened);
425
- } else if (form === '?') {
426
- objectProps.push(shortened);
427
- } else {
428
- literalProps.push(shortened);
429
- }
430
- });
407
+ const { literalProps, objectProps, reverseProps } = processPredicates(block.predicates, ctx);
431
408
 
432
409
  if (literalProps.length > 0) {
433
410
  attrs.push(`property="${escapeHtml(literalProps.join(' '))}"`);
@@ -474,17 +451,3 @@ function wrapWithRDFaContext(html, ctx) {
474
451
 
475
452
  return `<div${prefixDecl}${vocabDecl}>${html}</div>`;
476
453
  }
477
-
478
- /**
479
- * Escape HTML special characters
480
- */
481
- function escapeHtml(text) {
482
- const map = {
483
- '&': '&amp;',
484
- '<': '&lt;',
485
- '>': '&gt;',
486
- '"': '&quot;',
487
- "'": '&#39;'
488
- };
489
- return String(text || '').replace(/[&<>"']/g, m => map[m]);
490
- }
package/src/shared.js CHANGED
@@ -1,37 +1,5 @@
1
- /**
2
- * Shared utilities for MD-LD Parser and Renderer
3
- * Ensures DRY code and consistent CommonMark processing
4
- */
5
-
6
- export const DEFAULT_CONTEXT = {
7
- '@vocab': "http://www.w3.org/2000/01/rdf-schema#",
8
- rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
9
- rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
10
- xsd: 'http://www.w3.org/2001/XMLSchema#',
11
- sh: "http://www.w3.org/ns/shacl#",
12
- prov: 'http://www.w3.org/ns/prov#'
13
- };
14
-
15
- // CommonMark patterns - shared between parser and renderer
16
- export const URL_REGEX = /^(https?|ftp|mailto|tag|nih|urn|uuid|did|web|ipfs|ipns|data|file|urn:uuid):/;
17
- export const FENCE_REGEX = /^(`{3,}|~{3,})(.*)/;
18
- export const PREFIX_REGEX = /^\[([^\]]+)\]\s*<([^>]+)>/;
19
- export const HEADING_REGEX = /^(#{1,6})\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
20
- export const UNORDERED_LIST_REGEX = /^(\s*)([-*+]|\d+\.)\s+(.+?)(?:\s*(\{[^}]+\}))?\s*$/;
21
- export const BLOCKQUOTE_REGEX = /^>\s+(.+?)(?:\s*(\{[^}]+\}))?$/;
22
- export const STANDALONE_SUBJECT_REGEX = /^\s*\{=(.*?)\}\s*$/;
23
-
24
- // Inline carrier patterns - shared extraction logic
25
- export const INLINE_CARRIER_PATTERNS = {
26
- EMPHASIS: /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y,
27
- CODE_SPAN: /``(.+?)``\s*\{([^}]+)\}/y
28
- };
29
-
30
- // Pre-compiled carrier patterns for performance
31
- export const CARRIER_PATTERN_ARRAY = [
32
- ['EMPHASIS', /[*__`]+(.+?)[*__`]+\s*\{([^}]+)\}/y],
33
- ['CODE_SPAN', /``(.+?)``\s*\{([^}]+)\}/y]
34
- ];
1
+ import { DEFAULT_CONTEXT, STANDALONE_SUBJECT_REGEX, FENCE_REGEX, PREFIX_REGEX, HEADING_REGEX, UNORDERED_LIST_REGEX, BLOCKQUOTE_REGEX } from './constants.js';
2
+ import { parseSemanticBlock, expandIRI, shortenIRI } from './utils.js';
35
3
 
36
4
  // Cache for fence regex patterns
37
5
  export const FENCE_CLOSE_PATTERNS = new Map();
@@ -156,6 +124,172 @@ export const PROCESSORS = [
156
124
  { test: line => true, process: null } // Default: paragraph
157
125
  ];
158
126
 
127
+ // Token scanning processors - shared between parser and renderer
128
/**
 * Ordered table of line classifiers used when scanning tokens.
 *
 * Each entry has the shape `{ type, test, process }`. `test` receives one
 * line of text; `process` is `null` on every entry here (the handlers are
 * expected to be supplied by parse.js).
 * The `content` test always returns `false`, and the `para` test returns the
 * trimmed line itself, so it is truthy for any non-blank line.
 */
export const TOKEN_PROCESSORS = [
    ['fence', line => FENCE_REGEX.test(line.trim())],
    ['content', line => false],
    ['prefix', line => PREFIX_REGEX.test(line)],
    ['heading', line => HEADING_REGEX.test(line)],
    ['list', line => UNORDERED_LIST_REGEX.test(line)],
    ['blockquote', line => BLOCKQUOTE_REGEX.test(line)],
    ['para', line => line.trim()]
].map(([type, test]) => ({ type, test, process: null }));
137
+
138
+ // Language and attributes parsing
139
/**
 * Split a fence info string into its language tag and `{...}` attribute block.
 *
 * The language is everything before the first space or `{` (the whole string
 * when neither occurs). The attribute text is the first brace group without
 * nested braces found in the remainder.
 *
 * @param {string} langAndAttrs - Text following the opening fence.
 * @returns {{lang: string, attrsText: (string|null)}} `attrsText` includes
 *   the braces, or is `null` when no brace group is present.
 */
export function parseLangAndAttrs(langAndAttrs) {
    const firstDelimiter = langAndAttrs.search(/[ {]/);
    const langEnd = firstDelimiter === -1 ? langAndAttrs.length : firstDelimiter;

    const lang = langAndAttrs.substring(0, langEnd);
    const braceGroup = langAndAttrs.substring(langEnd).match(/\{[^{}]*\}/);
    const attrsText = braceGroup?.[0] || null;

    return { lang, attrsText };
}
151
+
152
+ // Carrier extraction utilities
153
/**
 * Locate the end of a bracketed span, honouring nested `[` ... `]` pairs.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketStart - Index of the opening `[`.
 * @returns {(number|null)} Index just past the matching `]`, or `null` when
 *   the bracket is never closed.
 */
export function findMatchingBracket(text, bracketStart) {
    let depth = 1;

    for (let i = bracketStart + 1; i < text.length; i += 1) {
        const ch = text[i];
        if (ch === '[') {
            depth += 1;
        } else if (ch === ']') {
            depth -= 1;
            if (depth === 0) return i + 1;
        }
    }

    return null;
}
165
+
166
/**
 * Read an optional `(url)` group that directly follows a bracketed span.
 *
 * @param {string} text - Text being scanned.
 * @param {number} bracketEnd - Index just past the closing `]`.
 * @returns {{url: (string|null), spanEnd: number}} The text between the
 *   parentheses and the index just past `)`. When there is no `(` at
 *   `bracketEnd`, or it is never closed, `url` is `null` and `spanEnd`
 *   equals `bracketEnd`.
 */
export function extractUrlFromBrackets(text, bracketEnd) {
    const noUrl = { url: null, spanEnd: bracketEnd };

    if (text[bracketEnd] !== '(') return noUrl;

    const closingParen = text.indexOf(')', bracketEnd);
    if (closingParen === -1) return noUrl;

    return {
        url: text.substring(bracketEnd + 1, closingParen),
        spanEnd: closingParen + 1
    };
}
180
+
181
/**
 * Read an optional `{...}` attribute block that follows a span, allowing
 * whitespace between the span and the opening brace.
 *
 * @param {string} text - Text being scanned.
 * @param {number} spanEnd - Index in `text` where the span ended.
 * @param {number} baseOffset - Offset added to `text` indices to form the
 *   reported range.
 * @returns {{attrs: (string|null), attrsRange: (number[]|null), finalSpanEnd: number}}
 *   `attrs` is the block including its braces, `attrsRange` is
 *   `[start, end)` shifted by `baseOffset`, and `finalSpanEnd` is the index
 *   just past `}`. When no closed block is found, `attrs` and `attrsRange`
 *   are `null` and `finalSpanEnd` equals `spanEnd`.
 */
export function extractAttributesFromText(text, spanEnd, baseOffset) {
    // Group 1: optional leading whitespace. Group 2: from "{" to the first "}".
    const found = /^(\s*)(\{[^}]*\})/.exec(text.substring(spanEnd));

    if (!found) {
        return { attrs: null, attrsRange: null, finalSpanEnd: spanEnd };
    }

    const [consumed, padding, attrs] = found;
    const absStart = baseOffset + spanEnd + padding.length;

    return {
        attrs,
        attrsRange: [absStart, absStart + attrs.length],
        finalSpanEnd: spanEnd + consumed.length
    };
}
201
+
202
/**
 * Classify a bracketed carrier by the URL that follows it.
 *
 * @param {(string|null)} url - Text taken from the `(…)` group, if any.
 * @returns {{carrierType: string, resourceIRI: (string|null)}} A `link`
 *   carrying `url` when `url` is non-empty and does not start with `=`;
 *   otherwise a `span` with no resource IRI.
 */
export function determineCarrierType(url) {
    const pointsToResource = Boolean(url) && !url.startsWith('=');

    return pointsToResource
        ? { carrierType: 'link', resourceIRI: url }
        : { carrierType: 'span', resourceIRI: null };
}
208
+
209
/**
 * Compute source ranges for an inline carrier from a regex match.
 *
 * @param {Array<string>} match - Match whose element 0 is the full carrier,
 *   element 1 the value text and element 2 the attribute text without braces.
 * @param {number} baseOffset - Offset added to produce the reported ranges.
 * @param {number} matchStart - Index of the match within the scanned text.
 * @returns {{valueRange: number[], attrsRange: number[], range: number[], pos: number}}
 *   `attrsRange` excludes the braces; `pos` is relative to the scanned text
 *   (it does not include `baseOffset`).
 */
export function calcCarrierRanges(match, baseOffset, matchStart) {
    const [whole, value, attrs] = match;
    const origin = baseOffset + matchStart;

    // The value is located at its first occurrence inside the full match.
    const valueStart = origin + whole.indexOf(value);
    const braceOpen = origin + whole.indexOf('{');
    const braceClose = braceOpen + attrs.length + 2; // +2 for { and }

    return {
        valueRange: [valueStart, valueStart + value.length],
        attrsRange: [braceOpen + 1, braceClose - 1],
        range: [origin, braceClose],
        pos: matchStart + whole.length
    };
}
221
+
222
+ // Clean text extraction utilities
223
/**
 * Return a token's text with its semantic annotation and block marker removed.
 *
 * The annotation is cut out using `token.attrsRange`, interpreted relative
 * to `token.range[0]` (or to 0 when the token has no range). The leading
 * marker is then stripped according to `token.type`:
 * - `heading`: the `#` run;
 * - `list`: optional indentation followed by a bullet (`-`, `*`, `+`) or an
 *   ordered marker such as `1.`, the same marker shapes the list line
 *   pattern accepts, so nested and ordered items are cleaned as well;
 * - `blockquote`: the `>` marker.
 *
 * @param {{text?: string, type?: string, range?: number[], attrsRange?: number[]}} token
 * @returns {string} The trimmed text, or `''` when the token has no text.
 */
export function extractCleanText(token) {
    if (!token.text) return '';

    let text = token.text;

    // Remove semantic annotations
    if (token.attrsRange) {
        const tokenStart = token.range?.[0] || 0;
        const beforeAttrs = text.substring(0, token.attrsRange[0] - tokenStart);
        const afterAttrs = text.substring(token.attrsRange[1] - tokenStart);
        text = beforeAttrs + afterAttrs;
    }

    // Clean based on token type
    switch (token.type) {
        case 'heading':
            return text.replace(/^#+\s*/, '').trim();
        case 'list':
            // An ordered marker must be followed by whitespace so that content
            // such as "3.14" is not mistaken for a list number.
            return text.replace(/^\s*(?:[-*+]\s*|\d+\.\s+)/, '').trim();
        case 'blockquote':
            return text.replace(/^>\s*/, '').trim();
        default:
            return text.trim();
    }
}
247
+
248
+ // Quad emission utilities
249
/** IRI of the `rdf:type` predicate. */
export const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';

/** IRI of the `rdf:Statement` class. */
export const RDF_STATEMENT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement';

/** IRI of the `rdf:subject` predicate. */
export const RDF_SUBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject';

/** IRI of the `rdf:predicate` predicate. */
export const RDF_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate';

/** IRI of the `rdf:object` predicate. */
export const RDF_OBJECT = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#object';
254
+
255
/**
 * Build a compact origin record linking a subject/predicate pair to the
 * block it came from.
 *
 * @param {object} block - Source block; its `id`, `range`, `carrierType`,
 *   `context` and `text` are read.
 * @param {{value: string}} subject - Subject term.
 * @param {{value: string}} predicate - Predicate term.
 * @param {(object|null)} [meta=null] - Optional flags; a truthy `remove`
 *   yields polarity `'-'`, anything else `'+'`.
 * @returns {object} The origin entry. `context` is the block's own context
 *   object (shared by reference, not copied); `value` falls back to `''`
 *   when the block has no text.
 */
export function createLeanOriginEntry(block, subject, predicate, meta = null) {
    const isRemoval = Boolean(meta?.remove);

    const entry = {
        blockId: block.id,
        range: block.range,
        carrierType: block.carrierType,
        subject: subject.value,
        predicate: predicate.value,
        context: block.context,
        polarity: isRemoval ? '-' : '+',
        value: block.text || ''
    };

    return entry;
}
267
+
268
+ // Fragment resolution utilities
269
/**
 * Resolve a fragment name against the current subject.
 *
 * Any existing fragment on the subject IRI (everything from its first `#`)
 * is dropped before the new one is appended.
 *
 * @param {string} fragment - Fragment name without the leading `#`.
 * @param {({value: string}|null)} currentSubject - Subject used as the base.
 * @param {{namedNode: function(string): object}} dataFactory - Term factory.
 * @returns {(object|null)} The named node for `<base>#<fragment>`, or `null`
 *   when there is no current subject.
 */
export function resolveFragment(fragment, currentSubject, dataFactory) {
    if (!currentSubject) return null;

    // split('#')[0] keeps the part before the first '#', or the whole IRI.
    const [baseIRI] = currentSubject.value.split('#');
    return dataFactory.namedNode(`${baseIRI}#${fragment}`);
}
276
+
277
/**
 * Turn the subject declared in a semantic block into a term.
 *
 * @param {{subject?: string}} sem - Parsed semantic block.
 * @param {object} state - Parser state; `currentSubject`, `df` and `ctx`
 *   are read, and `currentSubject` is cleared on `RESET`.
 * @returns {(object|null)} `null` when no subject is declared or the
 *   declaration is `RESET`; a fragment resolved against the current subject
 *   for `=#name`; otherwise a named node for the expanded IRI.
 */
export function resolveSubject(sem, state) {
    const declared = sem.subject;
    if (!declared) return null;

    if (declared === 'RESET') {
        state.currentSubject = null;
        return null;
    }

    const isFragment = declared.startsWith('=#');
    return isFragment
        ? resolveFragment(declared.substring(2), state.currentSubject, state.df)
        : state.df.namedNode(expandIRI(declared, state.ctx));
}
286
+
287
/**
 * Turn the object declared in a semantic block into a term.
 *
 * @param {{object?: string}} sem - Parsed semantic block.
 * @param {object} state - Parser state; `currentSubject`, `df` and `ctx` are read.
 * @returns {(object|null)} `null` when no object is declared; a fragment
 *   resolved against the current subject for `#name`; otherwise a named node
 *   for the expanded IRI.
 */
export function resolveObject(sem, state) {
    const declared = sem.object;
    if (!declared) return null;

    const isFragment = declared.startsWith('#');
    return isFragment
        ? resolveFragment(declared.substring(1), state.currentSubject, state.df)
        : state.df.namedNode(expandIRI(declared, state.ctx));
}
292
+
159
293
  // HTML escaping - shared utility
160
294
  export function escapeHtml(text) {
161
295
  if (!text) return '';
@@ -167,6 +301,77 @@ export function escapeHtml(text) {
167
301
  .replace(/'/g, '&#x27;');
168
302
  }
169
303
 
304
+ // RDF term type checking utilities - shared across modules
305
/**
 * Report whether a term is a literal.
 *
 * @param {*} term - Candidate term; `null` and `undefined` are accepted.
 * @returns {boolean} `true` when `term.termType` is `'Literal'`.
 */
export function isLiteral(term) {
    const { termType } = term ?? {};
    return termType === 'Literal';
}
308
+
309
/**
 * Report whether a term is a named node.
 *
 * @param {*} term - Candidate term; `null` and `undefined` are accepted.
 * @returns {boolean} `true` when `term.termType` is `'NamedNode'`.
 */
export function isNamedNode(term) {
    const { termType } = term ?? {};
    return termType === 'NamedNode';
}
312
+
313
/**
 * Report whether a term is the `rdf:type` predicate.
 *
 * Only the term's `value` is compared; its `termType` is not inspected.
 *
 * @param {*} term - Candidate term; `null` and `undefined` are accepted.
 * @returns {boolean} `true` when `term.value` is the rdf:type IRI.
 */
export function isRdfType(term) {
    const { value } = term ?? {};
    return value === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
}
316
+
317
+ // IRI prefix extraction utility
318
/**
 * Extract the prefix part of an IRI after shortening it against a context.
 *
 * @param {string} iri - IRI to inspect.
 * @param {object} context - Prefix map passed on to `shortenIRI`.
 * @returns {(string|null)} The text before the first `:` of the shortened
 *   form, or `null` when `iri` is empty or the shortened form has no `:`.
 */
export function getPrefixFromIRI(iri, context) {
    if (!iri) return null;

    // NOTE(review): if shortenIRI hands back an unshortened absolute IRI, the
    // value returned here would be its scheme (e.g. "http") - confirm that
    // callers tolerate that.
    const shortened = shortenIRI(iri, context);
    const colonAt = shortened.indexOf(':');

    return colonAt === -1 ? null : shortened.slice(0, colonAt);
}
326
+
327
+ // Prefix collection utility - used by generate.js
328
/**
 * Collect every prefix referenced by the quads of all subject groups.
 *
 * For each quad the subject, the predicate, the object (only when it is a
 * named node) and the object's datatype (when it has a non-empty value) are
 * inspected, in that order.
 *
 * @param {Map<*, Iterable<object>>} subjectGroups - Quads grouped by subject.
 * @param {object} context - Prefix map used to shorten IRIs.
 * @returns {Set<string>} The prefixes found.
 */
export function collectUsedPrefixes(subjectGroups, context) {
    const usedPrefixes = new Set();

    for (const subjectQuads of subjectGroups.values()) {
        for (const { subject, predicate, object } of subjectQuads) {
            const candidateIRIs = [subject.value, predicate.value];

            if (isNamedNode(object)) {
                candidateIRIs.push(object.value);
            }
            if (object.datatype && object.datatype.value) {
                candidateIRIs.push(object.datatype.value);
            }

            for (const iri of candidateIRIs) {
                const prefix = getPrefixFromIRI(iri, context);
                if (prefix) usedPrefixes.add(prefix);
            }
        }
    }

    return usedPrefixes;
}
357
+
358
+ // Token processing utility - eliminates duplication in TOKEN_PROCESSORS
359
/**
 * Process one token while tracking the block it creates.
 *
 * Steps: create the block entry, make it the current block and push its id
 * on the block stack, run any additional processors, run annotation
 * processing, then pop the stack and restore the enclosing block (looked up
 * in `state.origin.blocks`), or `null` when the stack is empty.
 *
 * @param {object} token - Token being processed; `token.type` is passed on.
 * @param {object} state - Parser state; `currentBlock`, `blockStack` and
 *   `origin.blocks` are used.
 * @param {function(object, object, string): void} processAnnotations
 * @param {function(object, object): {id: *}} createBlockEntry
 * @param {Array<function(object, object): void>} [additionalProcessors=[]]
 *   Callbacks run before annotation processing.
 * @returns {void}
 */
export function processTokenWithBlockTracking(token, state, processAnnotations, createBlockEntry, additionalProcessors = []) {
    const entry = createBlockEntry(token, state);
    state.currentBlock = entry;
    state.blockStack.push(entry.id);

    for (const runExtra of additionalProcessors) {
        runExtra(token, state);
    }

    processAnnotations(token, state, token.type);

    state.blockStack.pop();

    if (state.blockStack.length > 0) {
        const enclosingId = state.blockStack[state.blockStack.length - 1];
        state.currentBlock = state.origin.blocks.get(enclosingId);
    } else {
        state.currentBlock = null;
    }
}
374
+
170
375
  // Quad key generation - shared between parser and renderer
171
376
  export function quadIndexKey(subject, predicate, object) {
172
377
  const datatype = object.datatype?.value || '';
@@ -199,14 +404,126 @@ export function resolveSubjectType(subjectDecl) {
199
404
  return 'full-iri';
200
405
  }
201
406
 
202
- // Fragment resolution - shared logic
203
- export function resolveFragment(fragment, currentSubject) {
204
- if (!currentSubject) {
205
- throw new Error('Fragment requires current subject');
407
+ // Constants - shared across modules (bundle-size optimized)
408
/** IRI of the `xsd:string` datatype. */
export const XSD_STRING = 'http://www.w3.org/2001/XMLSchema#string';
409
+
410
+ // Optimized sorting utilities - inline for better minification
411
/**
 * Sort quads by predicate IRI using `String.prototype.localeCompare`.
 *
 * @param {Array<{predicate: {value: string}}>} quads - Quads to sort; the
 *   array is sorted in place.
 * @returns {Array<object>} The same array instance.
 */
export function sortQuadsByPredicate(quads) {
    const byPredicate = ({ predicate: left }, { predicate: right }) =>
        left.value.localeCompare(right.value);

    quads.sort(byPredicate);
    return quads;
}
414
+
415
+ // Optimized text generation - template literals for smaller bundle
416
/**
 * Render one prefix declaration line.
 *
 * @param {string} prefix - Prefix name.
 * @param {string} namespace - Namespace IRI the prefix stands for.
 * @returns {string} `[prefix] <namespace>` followed by a newline.
 */
export const generatePrefixDeclaration = (prefix, namespace) => {
    const declaration = `[${prefix}] <${namespace}>`;
    return `${declaration}\n`;
};
417
+
418
/**
 * Render a literal-valued quad as an annotated span line.
 *
 * The annotation is the shortened predicate, followed by ` @lang` for a
 * language-tagged literal, or by ` ^^datatype` when the literal carries a
 * datatype other than xsd:string. A literal with no `datatype` term at all
 * gets no suffix instead of raising a TypeError.
 *
 * @param {{predicate: {value: string}, object: {value: string, language?: string, datatype?: {value: string}}}} quad
 * @param {object} context - Prefix map used to shorten IRIs.
 * @returns {string} `[value] {annotation}` followed by a newline.
 */
export function generateLiteralText(quad, context) {
    const { predicate, object } = quad;
    let annotation = shortenIRI(predicate.value, context);

    // Terms built outside a spec-compliant factory may omit `datatype`.
    const datatypeIRI = object.datatype?.value;

    if (object.language) {
        annotation += ` @${object.language}`;
    } else if (datatypeIRI && datatypeIRI !== XSD_STRING) {
        annotation += ` ^^${shortenIRI(datatypeIRI, context)}`;
    }

    // NOTE(review): the literal value is emitted verbatim; a value containing
    // "]" may not survive a round trip - confirm how callers handle that.
    return `[${object.value}] {${annotation}}\n`;
}
430
+
431
/**
 * Render an IRI-valued quad as an annotated span line.
 *
 * @param {{predicate: {value: string}, object: {value: string}}} quad
 * @param {object} context - Prefix map used to shorten IRIs.
 * @returns {string} `[obj] {+obj ?pred}` followed by a newline, where both
 *   parts are shortened against `context`.
 */
export const generateObjectText = (quad, context) => {
    const [objShort, predShort] = [quad.object, quad.predicate]
        .map(term => shortenIRI(term.value, context));

    return `[${objShort}] {+${objShort} ?${predShort}}\n`;
};
436
+
437
+ // Optimized quad filtering - destructuring for smaller minified output
438
/**
 * Partition a subject's quads into type assertions, literal-valued quads
 * and IRI-valued quads.
 *
 * A quad whose predicate is rdf:type always goes to `types`, whatever its
 * object. Quads whose object is neither a literal nor a named node are
 * left out of every bucket.
 *
 * @param {Iterable<object>} subjectQuads - Quads sharing one subject.
 * @returns {{types: object[], literals: object[], objects: object[]}}
 */
export function filterQuadsByType(subjectQuads) {
    const types = [];
    const literals = [];
    const objects = [];

    // First matching rule wins, mirroring an if / else-if chain.
    const pickBucket = quad => {
        if (isRdfType(quad.predicate)) return types;
        if (isLiteral(quad.object)) return literals;
        if (isNamedNode(quad.object)) return objects;
        return null;
    };

    for (const quad of subjectQuads) {
        const bucket = pickBucket(quad);
        if (bucket !== null) bucket.push(quad);
    }

    return { types, literals, objects };
}
451
+
452
+ // Predicate processing utilities - common RDFa patterns
453
/**
 * Sort a block's predicates into the three RDFa attribute groups.
 *
 * Each predicate is either a string or an object with `iri` and an optional
 * `form`. The IRI is expanded and then shortened against `ctx`. Form `'!'`
 * goes to `reverseProps`, form `'?'` to `objectProps`, and everything else
 * (including plain strings) to `literalProps`.
 *
 * @param {Array<(string|{iri: string, form?: string})>} predicates
 * @param {object} ctx - Prefix map used for expansion and shortening.
 * @returns {{literalProps: string[], objectProps: string[], reverseProps: string[]}}
 */
export function processPredicates(predicates, ctx) {
    const literalProps = [];
    const objectProps = [];
    const reverseProps = [];

    // A Map (not a plain object) so an arbitrary form string can never hit
    // an inherited property.
    const bucketByForm = new Map([
        ['!', reverseProps],
        ['?', objectProps]
    ]);

    for (const pred of predicates) {
        const isBare = typeof pred === 'string';
        const iri = isBare ? pred : pred.iri;
        const shortened = shortenIRI(expandIRI(iri, ctx), ctx);
        const form = isBare ? '' : (pred.form || '');

        (bucketByForm.get(form) ?? literalProps).push(shortened);
    }

    return { literalProps, objectProps, reverseProps };
}
475
+
476
+ // Deterministic sorting utilities - ensure consistent output
477
/**
 * Sort an array by a string key using `String.prototype.localeCompare`.
 *
 * @param {Array<*>} array - Items to sort; the array is sorted in place.
 * @param {function(*): string} keyFn - Produces the sort key for an item.
 * @returns {Array<*>} The same array instance.
 */
export function sortDeterministic(array, keyFn) {
    const byKey = (left, right) => keyFn(left).localeCompare(keyFn(right));
    return array.sort(byKey);
}
484
+
485
/**
 * Sort quads by subject, then predicate, then object value.
 *
 * Quads that still compare equal (same lexical value but a different term
 * type, datatype or language) are ordered by those object attributes, so
 * the result does not depend on the order of the input.
 *
 * @param {Array<object>} quads - Quads to sort; the array is sorted in place.
 * @returns {Array<object>} The same array instance.
 */
export function sortQuadsDeterministically(quads) {
    // NOTE(review): localeCompare follows the host's default locale; pass an
    // explicit locale here if output must be identical across machines.
    const compareText = (left, right) => left.localeCompare(right);

    // Secondary keys used only when subject, predicate and object value tie.
    const tieBreakKeys = term => [
        term.termType ?? '',
        term.datatype?.value ?? '',
        term.language ?? ''
    ];

    return quads.sort((a, b) => {
        const primary =
            compareText(a.subject.value, b.subject.value) ||
            compareText(a.predicate.value, b.predicate.value) ||
            compareText(a.object.value, b.object.value);
        if (primary !== 0) return primary;

        const keysA = tieBreakKeys(a.object);
        const keysB = tieBreakKeys(b.object);
        for (let i = 0; i < keysA.length; i += 1) {
            const order = compareText(keysA[i], keysB[i]);
            if (order !== 0) return order;
        }
        return 0;
    });
}
497
+
498
+ // Optimized deterministic prefix generation
499
/**
 * Render prefix declarations for the prefixes actually used, in sorted order.
 *
 * A context entry is emitted only when its key does not start with `@`, has
 * no truthy entry in `DEFAULT_CONTEXT`, and is present in `usedPrefixes`.
 *
 * @param {object} context - Prefix map (prefix -> namespace IRI).
 * @param {Set<string>} usedPrefixes - Prefixes referenced by the output.
 * @returns {string} The concatenated declaration lines (possibly empty).
 */
export function generateDeterministicPrefixes(context, usedPrefixes) {
    // NOTE(review): a context that rebinds a default prefix to a different
    // namespace is skipped here too - confirm that is intended.
    const isDeclarable = prefix =>
        !prefix.startsWith('@') && !DEFAULT_CONTEXT[prefix] && usedPrefixes.has(prefix);

    return Object.entries(context)
        .sort(([a], [b]) => a.localeCompare(b))
        .filter(([prefix]) => isDeclarable(prefix))
        .map(([prefix, namespace]) => generatePrefixDeclaration(prefix, namespace))
        .join('');
}
511
+
512
+ // Memory-efficient block creation
513
/**
 * Create a block record for a token and register it in the origin index.
 *
 * The id is a hash of the token's range and the first 50 characters of its
 * text; a token without text is hashed with an empty text part instead of
 * raising a TypeError. The block stores a shallow copy of the current
 * context, so later changes to `state.ctx` do not affect it.
 *
 * @param {{type: string, range: number[], text?: string}} token
 * @param {object} state - Parser state; `currentSubject`, `ctx` and
 *   `origin.blocks` are used.
 * @returns {object} The new block, already stored in `state.origin.blocks`.
 */
export function createOptimizedBlockEntry(token, state) {
    const [rangeStart, rangeEnd] = token.range;
    const textSample = (token.text ?? '').slice(0, 50);

    // NOTE(review): `hash` is not among the names imported at the top of this
    // module in the visible diff - confirm it is in scope here.
    const id = hash(`${rangeStart}-${rangeEnd}-${textSample}`);

    const block = {
        id,
        type: token.type,
        carrierType: token.type,
        range: token.range,
        text: token.text,
        carriers: [],
        predicates: [],
        subject: state.currentSubject,
        context: { ...state.ctx }
    };

    state.origin.blocks.set(id, block);
    return block;
}
package/src/utils.js CHANGED
@@ -1,4 +1,4 @@
1
- import { URL_REGEX, DEFAULT_CONTEXT } from './shared.js';
1
+ import { URL_REGEX, DEFAULT_CONTEXT } from './constants.js';
2
2
 
3
3
  // Base Term class for RDF/JS compatibility
4
4
  export class Term {