@mdream/js 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,347 @@
1
+ import { join, relative } from "pathe";
2
+ import { mkdir, open } from "node:fs/promises";
3
//#region src/llms-txt.ts
// Leading YAML frontmatter block: capture 1 = frontmatter lines, capture 2 = document body.
const FRONTMATTER_RE = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
// Characters invalid in a markdown heading anchor (anything outside [a-z0-9]); replaced with "-".
const ANCHOR_INVALID_CHARS_RE = /[^a-z0-9]/g;
// Slash trimming used when converting a URL into a markdown file path.
const LEADING_SLASH_RE = /^\//;
const TRAILING_SLASH_RE = /\/$/;
8
/**
 * Build the llms.txt index document: site header, optional description,
 * canonical origin and sections, then a "## Pages" bullet list that links
 * each file — a local .md path when one is available, otherwise the page
 * URL resolved against the configured origin.
 */
function generateLlmsTxtContent(files, options) {
  const { siteName = "Site", description, origin = "", sections, notes } = options;
  const parts = [`# ${siteName}\n\n`];
  if (description) parts.push(`> ${description}\n\n`);
  if (origin) parts.push(`Canonical Origin: ${origin}\n\n`);
  if (sections) {
    for (const section of sections) parts.push(formatSection(section));
  }
  if (files.length > 0) {
    parts.push(`## Pages\n\n`);
    for (const file of files) {
      const summary = file.metadata?.description;
      // Descriptions are truncated to 100 characters with an ellipsis.
      const suffix = summary ? `: ${summary.substring(0, 100)}${summary.length > 100 ? "..." : ""}` : "";
      let target;
      if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
        // Prefer a path relative to the output directory for local markdown.
        target = relative(options.outputDir, file.filePath);
      } else if (file.url.startsWith("http://") || file.url.startsWith("https://")) {
        target = file.url;
      } else {
        target = origin ? origin + file.url : file.url;
      }
      parts.push(`- [${file.title}](${target})${suffix}\n`);
    }
  }
  if (notes) parts.push(`\n${formatNotes(notes)}`);
  return parts.join("");
}
34
/**
 * Split a markdown document into an optional leading YAML-style frontmatter
 * block and the remaining body. Frontmatter lines are parsed as flat
 * "key: value" pairs — no nesting; values are kept as raw trimmed strings.
 * Returns { frontmatter: null, body: content } when no frontmatter exists.
 */
function parseFrontmatter(content) {
  const match = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/.exec(content);
  if (!match) {
    return { frontmatter: null, body: content };
  }
  const [, rawFrontmatter, body] = match;
  const frontmatter = {};
  for (const line of rawFrontmatter.split("\n")) {
    const colonIndex = line.indexOf(":");
    // A colon at index 0 (or no colon at all) means the line has no key — skip it.
    if (colonIndex > 0) {
      frontmatter[line.substring(0, colonIndex).trim()] = line.substring(colonIndex + 1).trim();
    }
  }
  return { frontmatter, body };
}
59
/**
 * Serialize a flat object into YAML-like "key: value" lines.
 * Entries whose value is null or undefined are omitted entirely.
 */
function serializeFrontmatter(data) {
  return Object.entries(data)
    .filter(([, value]) => value !== void 0 && value !== null)
    .map(([key, value]) => `${key}: ${String(value)}`)
    .join("\n");
}
67
/**
 * Build the llms-full.txt document: the shared site header, an anchor-linked
 * table of contents, then every page's complete markdown body wrapped in
 * merged frontmatter and separated by "---" horizontal rules.
 */
function generateLlmsFullTxtContent(files, options) {
  const { siteName = "Site", description, origin = "", sections, notes } = options;
  const out = [`# ${siteName}\n\n`];
  if (description) out.push(`> ${description}\n\n`);
  if (origin) out.push(`Canonical Origin: ${origin}\n\n`);
  if (sections) {
    for (const section of sections) out.push(formatSection(section));
  }
  if (files.length > 0) {
    out.push(`## Table of Contents\n\n`);
    for (const file of files) {
      // Anchors mirror how markdown renderers slugify headings.
      const anchor = file.title.toLowerCase().replace(ANCHOR_INVALID_CHARS_RE, "-");
      out.push(`- [${file.title}](#${anchor})\n`);
    }
    out.push(`\n---\n\n`);
    for (const file of files) {
      const isAbsolute = file.url.startsWith("http://") || file.url.startsWith("https://");
      const url = isAbsolute ? file.url : origin ? origin + file.url : file.url;
      const { frontmatter, body } = parseFrontmatter(file.content);
      const metadata = { title: file.title, url };
      if (file.filePath && options.outputDir) {
        metadata.file = relative(options.outputDir, file.filePath);
      } else if (file.filePath) {
        metadata.file = file.filePath;
      }
      if (file.metadata) {
        if (file.metadata.description) metadata.description = file.metadata.description;
        if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
        if (file.metadata.author) metadata.author = file.metadata.author;
      }
      // Generated metadata wins over keys already present in the page's own frontmatter.
      const merged = frontmatter ? { ...frontmatter, ...metadata } : metadata;
      const frontmatterString = serializeFrontmatter(merged);
      let contentBody = frontmatter ? body : file.content;
      // Drop a leading line that merely repeats the page title.
      const firstLine = contentBody.trim().split("\n")[0];
      if (firstLine === file.title || firstLine === `# ${file.title}`) {
        contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
      }
      out.push(`---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`);
    }
  }
  if (notes) out.push(`\n${formatNotes(notes)}`);
  return out.join("");
}
110
/**
 * Map each page to an individual markdown file entry under "md/".
 * The root URL "/" becomes md/index.md; every other URL is stripped of its
 * leading and trailing slashes and suffixed with ".md".
 */
function generateMarkdownFilesContent(files) {
  return files.map((file) => {
    const slug = file.url === "/"
      ? "index"
      : file.url.replace(/^\//, "").replace(/\/$/, "");
    return { path: `md/${slug}.md`, content: file.content };
  });
}
124
/**
 * Generate all requested llms.txt artifacts from pre-processed files:
 * llms.txt is always produced; llms-full.txt and per-page markdown files
 * are produced only when enabled via options.generateFull and
 * options.generateMarkdown respectively (otherwise those keys are undefined).
 */
async function generateLlmsTxtArtifacts(options) {
  const { files } = options;
  return {
    llmsTxt: generateLlmsTxtContent(files, options),
    llmsFullTxt: options.generateFull ? generateLlmsFullTxtContent(files, options) : void 0,
    markdownFiles: options.generateMarkdown ? generateMarkdownFilesContent(files) : void 0,
    processedFiles: files
  };
}
141
/**
 * Render a section as markdown: a "## title" heading, optional description
 * paragraph(s), and an optional bullet list of links (each with an optional
 * ": description" suffix) followed by a blank line.
 */
function formatSection(section) {
  const pieces = [`## ${section.title}\n\n`];
  if (section.description) {
    // A single string is treated as a one-element list of paragraphs.
    const descriptions = Array.isArray(section.description) ? section.description : [section.description];
    pieces.push(...descriptions.map((desc) => `${desc}\n\n`));
  }
  if (section.links?.length) {
    for (const link of section.links) {
      const suffix = link.description ? `: ${link.description}` : "";
      pieces.push(`- [${link.title}](${link.href})${suffix}\n`);
    }
    pieces.push("\n");
  }
  return pieces.join("");
}
159
/**
 * Render notes (a single string or an array of strings) as a sequence of
 * markdown paragraphs, each followed by a blank line.
 */
function formatNotes(notes) {
  const noteLines = Array.isArray(notes) ? notes : [notes];
  return noteLines.map((note) => `${note}\n\n`).join("");
}
168
/**
 * Derive the grouping prefix for a URL from its first one or two path
 * segments: the root ("/" or empty path) maps to "/"; depth 1 — or a URL
 * with a single segment — yields "/first"; otherwise "/first/second".
 */
function getGroupPrefix(url, depth) {
  const segments = url.split("/").filter(Boolean);
  switch (segments.length) {
    case 0:
      return "/";
    case 1:
      return `/${segments[0]}`;
    default:
      return depth === 1 ? `/${segments[0]}` : `/${segments[0]}/${segments[1]}`;
  }
}
177
/**
 * Sort pages by URL path in hierarchical order (directory tree structure).
 * Groups by up to 2 segments, with root-level pages without nesting grouped together.
 *
 * Fix: returns a sorted copy rather than sorting the input array in place,
 * so callers' arrays are never mutated (the previous implementation used
 * `pages.sort(...)`, which mutates its parameter).
 */
function sortPagesByPath(pages) {
  // How many pages share each two-segment prefix; a prefix shared by more
  // than one page forms its own sort group.
  const twoSegmentCount = /* @__PURE__ */ new Map();
  for (const page of pages) {
    const prefix = getGroupPrefix(page.url, 2);
    twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
  }
  // Whether any page nests deeper than one level under each first segment.
  const segmentHasNested = /* @__PURE__ */ new Map();
  for (const page of pages) {
    const segments = page.url.split("/").filter(Boolean);
    const firstSegment = segments.length > 0 ? segments[0] : "";
    if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
    if (segments.length > 1) segmentHasNested.set(firstSegment, true);
  }
  // Copy before sorting so the caller's array is left untouched.
  return [...pages].sort((a, b) => {
    const segmentsA = a.url.split("/").filter(Boolean);
    const segmentsB = b.url.split("/").filter(Boolean);
    const firstSegmentA = segmentsA.length > 0 ? segmentsA[0] : "";
    const firstSegmentB = segmentsB.length > 0 ? segmentsB[0] : "";
    const twoSegPrefixA = getGroupPrefix(a.url, 2);
    const twoSegPrefixB = getGroupPrefix(b.url, 2);
    const twoSegCountA = twoSegmentCount.get(twoSegPrefixA) || 0;
    const twoSegCountB = twoSegmentCount.get(twoSegPrefixB) || 0;
    // A shared two-segment prefix forms its own group; otherwise pages group
    // under their first segment.
    let groupKeyA = twoSegCountA > 1 ? twoSegPrefixA : `/${firstSegmentA}`;
    let groupKeyB = twoSegCountB > 1 ? twoSegPrefixB : `/${firstSegmentB}`;
    const isRootLevelA = segmentsA.length <= 1;
    const isRootLevelB = segmentsB.length <= 1;
    const hasNestedA = segmentHasNested.get(firstSegmentA);
    const hasNestedB = segmentHasNested.get(firstSegmentB);
    // Root-level pages whose segment has no nested children share the ""
    // group, which always sorts first.
    if (isRootLevelA && !hasNestedA) groupKeyA = "";
    if (isRootLevelB && !hasNestedB) groupKeyB = "";
    if (groupKeyA === "" && groupKeyB !== "") return -1;
    if (groupKeyA !== "" && groupKeyB === "") return 1;
    if (groupKeyA !== groupKeyB) return groupKeyA.localeCompare(groupKeyB);
    // Within a group: the bare root URL first, then segment-wise comparison,
    // with shorter paths sorting before their descendants.
    if (segmentsA.length === 0) return -1;
    if (segmentsB.length === 0) return 1;
    const minLen = Math.min(segmentsA.length, segmentsB.length);
    for (let i = 0; i < minLen; i++) {
      const cmp = segmentsA[i].localeCompare(segmentsB[i]);
      if (cmp !== 0) return cmp;
    }
    return segmentsA.length - segmentsB.length;
  });
}
224
/**
 * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk.
 *
 * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
 * never keeping full content in memory. Creates outputDir recursively if needed.
 *
 * Lifecycle: start() opens the output file(s) and writes the shared header;
 * write() buffers a lightweight index entry per page (and, when enabled,
 * appends the page's full body to llms-full.txt immediately); close() sorts
 * the buffered index, writes the grouped "## Pages" list plus optional notes,
 * and closes the handles.
 */
function createLlmsTxtStream(options) {
  const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
  let llmsTxtHandle;
  let llmsFullTxtHandle;
  // Per-page index (url/title/description/filePath only — never page content),
  // buffered so close() can sort and group the llms.txt page list.
  const bufferedPages = [];
  return new WritableStream({
    async start() {
      await mkdir(outputDir, { recursive: true });
      llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
      // Shared header: site name, optional description/origin, optional sections.
      let header = `# ${siteName}\n\n`;
      if (description) header += `> ${description}\n\n`;
      if (origin) header += `Canonical Origin: ${origin}\n\n`;
      if (sections) for (const section of sections) header += formatSection(section);
      await llmsTxtHandle.write(header);
      if (generateFull) {
        // llms-full.txt repeats the same header before the streamed page bodies.
        llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
        let fullHeader = `# ${siteName}\n\n`;
        if (description) fullHeader += `> ${description}\n\n`;
        if (origin) fullHeader += `Canonical Origin: ${origin}\n\n`;
        if (sections) for (const section of sections) fullHeader += formatSection(section);
        await llmsFullTxtHandle.write(fullHeader);
      }
    },
    async write(file) {
      const desc = file.metadata?.description;
      bufferedPages.push({
        url: file.url,
        title: file.title,
        description: desc,
        filePath: file.filePath
      });
      if (generateFull && llmsFullTxtHandle) {
        // Absolute URLs pass through; relative ones are prefixed with origin when set.
        const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
        const { frontmatter, body } = parseFrontmatter(file.content);
        const metadata = {
          title: file.title,
          url
        };
        if (file.filePath) metadata.file = relative(outputDir, file.filePath);
        if (file.metadata) {
          if (file.metadata.description) metadata.description = file.metadata.description;
          if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
          if (file.metadata.author) metadata.author = file.metadata.author;
        }
        // Generated metadata overrides keys already present in the page's frontmatter.
        const frontmatterString = serializeFrontmatter(frontmatter ? {
          ...frontmatter,
          ...metadata
        } : metadata);
        let contentBody = frontmatter ? body : file.content;
        // Strip a leading line that merely repeats the page title.
        const titleLine = contentBody.trim().split("\n")[0];
        if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
        const fullChunk = `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
        await llmsFullTxtHandle.write(fullChunk);
      }
    },
    async close() {
      const sortedPages = sortPagesByPath(bufferedPages);
      // Recompute the grouping tables over the sorted pages (mirrors the
      // internals of sortPagesByPath) to decide where separator lines go.
      const twoSegmentCount = /* @__PURE__ */ new Map();
      for (const page of sortedPages) {
        const prefix = getGroupPrefix(page.url, 2);
        twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
      }
      const segmentHasNested = /* @__PURE__ */ new Map();
      for (const page of sortedPages) {
        const segments = page.url.split("/").filter(Boolean);
        const firstSegment = segments.length > 0 ? segments[0] : "";
        if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
        if (segments.length > 1) segmentHasNested.set(firstSegment, true);
      }
      await llmsTxtHandle?.write(`## Pages\n\n`);
      let currentGroup = "";
      let segmentGroupIndex = 0;
      let urlsInCurrentGroup = 0;
      for (let i = 0; i < sortedPages.length; i++) {
        const page = sortedPages[i];
        const segments = page.url.split("/").filter(Boolean);
        const firstSegment = segments.length > 0 ? segments[0] : "";
        const twoSegPrefix = getGroupPrefix(page.url, 2);
        // A two-segment prefix shared by several pages forms its own group;
        // otherwise pages group under their first segment.
        let groupKey = (twoSegmentCount.get(twoSegPrefix) || 0) > 1 ? twoSegPrefix : `/${firstSegment}`;
        const isRootLevel = segments.length <= 1;
        const hasNested = segmentHasNested.get(firstSegment);
        // Root-level pages with no nested siblings share the "" group, listed first.
        if (isRootLevel && !hasNested) groupKey = "";
        if (groupKey !== currentGroup) {
          if (urlsInCurrentGroup > 0) {
            // NOTE(review): a blank separator is emitted after the first group,
            // or after groups 1-2 only when they held more than one entry —
            // looks intentional but asymmetric; confirm against the expected
            // llms.txt formatting.
            if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
          }
          currentGroup = groupKey;
          segmentGroupIndex++;
          urlsInCurrentGroup = 0;
        }
        urlsInCurrentGroup++;
        // NOTE(review): truncates descriptions at 160 chars here, while the
        // batch generateLlmsTxtContent uses 100 — confirm whether intentional.
        const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
        let chunk = "";
        if (page.filePath && page.filePath.endsWith(".md")) {
          // Local markdown: link by path relative to the output directory.
          const relativePath = relative(outputDir, page.filePath);
          chunk = `- [${page.title}](${relativePath})${descText}\n`;
        } else {
          const url = page.url.startsWith("http://") || page.url.startsWith("https://") ? page.url : origin ? origin + page.url : page.url;
          chunk = `- [${page.title}](${url})${descText}\n`;
        }
        await llmsTxtHandle?.write(chunk);
      }
      if (notes) {
        const notesContent = formatNotes(notes);
        await llmsTxtHandle?.write(`\n${notesContent}`);
        if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
      }
      await llmsTxtHandle?.close();
      await llmsFullTxtHandle?.close();
    },
    async abort(_reason) {
      // Best-effort cleanup: close the handles; any partially written output
      // files remain on disk.
      await llmsTxtHandle?.close();
      await llmsFullTxtHandle?.close();
    }
  });
}
346
+ //#endregion
347
+ export { createLlmsTxtStream, generateLlmsTxtArtifacts };
@@ -0,0 +1,26 @@
1
//#region src/negotiate.d.ts
/** One media type parsed from an HTTP Accept header. */
interface AcceptEntry {
  /** The media type itself, e.g. "text/markdown". */
  type: string;
  /** Quality weight from the "q" parameter (defaults to 1). */
  q: number;
  /** Index of the entry within the header, used to break ties between equal q values. */
  position: number;
}
/**
 * Parse an HTTP Accept header into an ordered list of media types with quality values.
 * Supports quality weights (q=0.9) and preserves original position for tie-breaking.
 */
declare function parseAcceptHeader(accept: string): AcceptEntry[];
/**
 * Determine if a client prefers markdown over HTML using proper content negotiation.
 *
 * Uses Accept header quality weights and position ordering:
 * - If text/markdown or text/plain has higher quality than text/html -> markdown
 * - If same quality, earlier position in Accept header wins
 * - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
 * - sec-fetch-dest: document always returns false (browser navigation)
 *
 * @param acceptHeader - The HTTP Accept header value
 * @param secFetchDest - The Sec-Fetch-Dest header value
 */
declare function shouldServeMarkdown(acceptHeader?: string, secFetchDest?: string): boolean;
//#endregion
export { parseAcceptHeader, shouldServeMarkdown };
@@ -0,0 +1,92 @@
1
+ //#region src/negotiate.ts
2
/**
 * Parse an HTTP Accept header into an ordered list of media types with quality values.
 * Supports quality weights (q=0.9) and preserves original position for tie-breaking.
 *
 * Fix: the quality parameter is located by scanning the ";"-separated
 * parameters for one that starts with "q=" (or "Q=", since HTTP parameter
 * names are case-insensitive per RFC 9110), instead of a bare
 * indexOf("q=") which could also match inside another parameter name
 * (e.g. "maxq=2" was previously misread as a quality of 2).
 *
 * @param accept - Raw Accept header value; empty/falsy input yields [].
 * @returns Entries of { type, q, position } in header order.
 */
function parseAcceptHeader(accept) {
  if (!accept) return [];
  const entries = [];
  const parts = accept.split(",");
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i].trim();
    if (!part) continue;
    const params = part.split(";");
    const type = params[0].trim();
    let q = 1;
    // Scan parameters (everything after the media type) for the q weight.
    for (let p = 1; p < params.length; p++) {
      const param = params[p].trim();
      if (param.length > 1 && (param[0] === "q" || param[0] === "Q") && param[1] === "=") {
        // Take the value up to the first space; unparsable values fall back
        // to 0, matching the previous behavior.
        q = +param.slice(2).trim().split(" ")[0] || 0;
        break;
      }
    }
    entries.push({
      type,
      q,
      position: i
    });
  }
  return entries;
}
36
/**
 * Determine if a client prefers markdown over HTML using proper content negotiation.
 *
 * Uses Accept header quality weights and position ordering:
 * - If text/markdown or text/plain has higher quality than text/html -> markdown
 * - If same quality, earlier position in Accept header wins
 * - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
 * - sec-fetch-dest: document always returns false (browser navigation)
 *
 * Fix: the q weight is extracted by scanning individual ";"-separated
 * parameters (so "q=" cannot falsely match inside another parameter name
 * such as "maxq=2", which the previous indexOf-based scan misparsed).
 *
 * @param acceptHeader - The HTTP Accept header value
 * @param secFetchDest - The Sec-Fetch-Dest header value
 */
function shouldServeMarkdown(acceptHeader, secFetchDest) {
  // Browser top-level navigations always get HTML.
  if (secFetchDest === "document") return false;
  const accept = acceptHeader || "";
  if (!accept) return false;
  // Extract the q weight from a single entry's parameter list (params[0] is
  // the media type). Defaults to 1 when no q parameter is present.
  const qualityOf = (params) => {
    for (let p = 1; p < params.length; p++) {
      const param = params[p].trim();
      if (param.length > 1 && (param[0] === "q" || param[0] === "Q") && param[1] === "=") {
        return +param.slice(2).trim().split(" ")[0] || 0;
      }
    }
    return 1;
  };
  const parts = accept.split(",");
  let bestMdQ = -1;
  let bestMdPos = -1;
  let htmlQ = -1;
  let htmlPos = -1;
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i].trim();
    if (!part) continue;
    const params = part.split(";");
    const type = params[0].trim();
    const q = qualityOf(params);
    if (type === "text/markdown" || type === "text/plain") {
      // Track the strongest markdown-capable entry: higher q wins, then
      // earlier position.
      if (q > bestMdQ || q === bestMdQ && (bestMdPos === -1 || i < bestMdPos)) {
        bestMdQ = q;
        bestMdPos = i;
      }
    } else if (type === "text/html") {
      htmlQ = q;
      htmlPos = i;
    }
  }
  if (bestMdPos === -1) return false; // no markdown-capable type at all (bare */* does not count)
  if (htmlPos === -1) return true; // markdown requested and HTML absent
  if (bestMdQ > htmlQ) return true;
  if (bestMdQ === htmlQ && bestMdPos < htmlPos) return true;
  return false;
}
91
+ //#endregion
92
+ export { parseAcceptHeader, shouldServeMarkdown };
@@ -0,0 +1,57 @@
1
+ import { _ as TransformPlugin, d as NodeEvent, m as TagHandler, r as ElementNode, u as Node } from "./_chunks/types.mjs";
2
+
3
//#region src/parse.d.ts
/** Options accepted by parseHtml. */
interface ParseOptions {
  /** Already-resolved plugin instances that receive parser events. */
  resolvedPlugins?: TransformPlugin[];
}
/** Mutable state threaded through the streaming parser between input chunks. */
interface ParseState {
  /** Map of tag names to their current nesting depth - uses TypedArray for performance */
  depthMap: Uint8Array;
  /** Current overall nesting depth */
  depth: number;
  /** Currently processing element node */
  currentNode?: ElementNode | null;
  /** Whether current content contains HTML entities that need decoding */
  hasEncodedHtmlEntity?: boolean;
  /** Whether the last processed character was whitespace - for collapsing whitespace */
  lastCharWasWhitespace?: boolean;
  /** Whether the last processed buffer has whitespace - optimization flag */
  textBufferContainsWhitespace?: boolean;
  /** Whether the last processed buffer contains non-whitespace characters */
  textBufferContainsNonWhitespace?: boolean;
  /** Whether a tag was just closed - affects whitespace handling */
  justClosedTag?: boolean;
  /** Whether the next text node is the first in its element - for whitespace trimming */
  isFirstTextInElement?: boolean;
  /** Reference to the last processed text node - for context tracking */
  lastTextNode?: Node;
  /** Quote state tracking for non-nesting tags - avoids backward scanning */
  inSingleQuote?: boolean;
  inDoubleQuote?: boolean;
  inBacktick?: boolean;
  /** Backslash escaping state tracking - avoids checking previous character */
  lastCharWasBackslash?: boolean;
  /** Resolved plugin instances for event processing */
  resolvedPlugins?: TransformPlugin[];
  /** Tag override handlers built from declarative tagOverrides config */
  tagOverrideHandlers?: Map<string, TagHandler>;
}
/** Result of a one-shot parse. */
interface ParseResult {
  /** DOM events emitted for the parsed portion of the input. */
  events: NodeEvent[];
  /** Trailing input that was not consumed — presumably carried into a subsequent parse; confirm against the implementation. */
  remainingHtml: string;
}
/**
 * Pure HTML parser that emits DOM events
 * Completely decoupled from markdown generation
 */
declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
/**
 * Streaming HTML parser - calls onEvent for each DOM event
 */
declare function parseHtmlStream(html: string, state: ParseState, onEvent: (event: NodeEvent) => void): string;
/**
 * Parse HTML attributes string into key-value object
 */
declare function parseAttributes(attrStr: string): Record<string, string>;
//#endregion
export { ParseOptions, ParseResult, ParseState, parseAttributes, parseHtml, parseHtmlStream };
package/dist/parse.mjs ADDED
@@ -0,0 +1,3 @@
1
// Public "parse" entry point: imports the shared const chunk for its side
// effects, then re-exports the parser API under its public names.
import "./_chunks/const.mjs";
import { n as parseHtml, r as parseHtmlStream, t as parseAttributes } from "./_chunks/parse.mjs";
export { parseAttributes, parseHtml, parseHtmlStream };
@@ -0,0 +1,93 @@
1
+ import { _ as TransformPlugin, a as ExtractedElement$1, l as MdreamRuntimeState, r as ElementNode } from "./_chunks/types.mjs";
2
+ import { r as createPlugin } from "./_chunks/index.mjs";
3
//#region src/plugins/extraction.d.ts
/** An element node augmented with its collected text content. */
interface ExtractedElement extends ElementNode {
  textContent: string;
}
/**
 * @deprecated Use `plugins.extraction` config for declarative extraction that works with both JS and Rust engines.
 */
declare function extractionPlugin(selectors: Record<string, (element: ExtractedElement, state: MdreamRuntimeState) => void>): TransformPlugin;
/**
 * Extraction collector for `plugins.extraction` config.
 * Collects results during processing; callbacks are called post-conversion
 * to match Rust engine behavior.
 */
declare function extractionCollectorPlugin(config: Record<string, (element: ExtractedElement$1) => void>): {
  /** The transform plugin to register with the conversion pipeline. */
  plugin: TransformPlugin;
  /** Returns the elements collected so far. */
  getResults: () => ExtractedElement$1[];
  /** Invokes the configured callbacks (intended post-conversion). */
  callHandlers: () => void;
};
//#endregion
//#region src/plugins/filter.d.ts
/**
 * Plugin that filters nodes based on CSS selectors.
 * Allows including or excluding nodes based on selectors.
 *
 * @example
 * // Include only heading elements and their children
 * withQuerySelectorPlugin({ include: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] })
 *
 * @example
 * // Exclude navigation, sidebar, and footer
 * withQuerySelectorPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
 */
declare function filterPlugin(options?: {
  /** CSS selectors, tag names, or TAG_* constants for elements to include (all others will be excluded) */
  include?: (string | number)[];
  /** CSS selectors, tag names, or TAG_* constants for elements to exclude */
  exclude?: (string | number)[];
  /** Whether to also process the children of matching elements */
  processChildren?: boolean;
  /** NOTE(review): undocumented upstream; semantics not evident from this declaration — confirm against the implementation. */
  keepAbsolute?: boolean;
}): TransformPlugin;
//#endregion
//#region src/plugins/frontmatter.d.ts
interface FrontmatterPluginOptions {
  /** Additional frontmatter fields to include */
  additionalFields?: Record<string, string>;
  /** Meta tag names to extract (beyond the standard ones) */
  metaFields?: string[];
}
/**
 * A plugin that manages frontmatter generation from HTML head elements
 * Extracts metadata from meta tags and title and generates YAML frontmatter
 */
declare function frontmatterPlugin(options?: FrontmatterPluginOptions): TransformPlugin;
//#endregion
//#region src/plugins/isolate-main.d.ts
/**
 * Plugin that isolates main content using the following priority order:
 * 1. If an explicit <main> element exists (within 5 depth levels), use its content exclusively
 * 2. Otherwise, find content between the first header tag (h1-h6) and first footer
 * 3. If footer is within 5 levels of nesting from the header, use it as the end boundary
 * 4. Exclude all content before the start marker and after the end marker
 *
 * @example
 * ```html
 * <body>
 *   <nav>Navigation (excluded)</nav>
 *   <main>
 *     <h1>Main Title (included)</h1>
 *     <p>Main content (included)</p>
 *   </main>
 *   <footer>Footer (excluded)</footer>
 * </body>
 * ```
 *
 * @example
 * ```html
 * <body>
 *   <nav>Navigation (excluded)</nav>
 *   <h1>Main Title (included)</h1>
 *   <p>Main content (included)</p>
 *   <footer>Footer (excluded)</footer>
 * </body>
 * ```
 */
declare function isolateMainPlugin(): TransformPlugin;
//#endregion
//#region src/plugins/tailwind.d.ts
/**
 * Creates a plugin that adds Tailwind class processing
 */
declare function tailwindPlugin(): TransformPlugin;
//#endregion
export { type ExtractedElement, createPlugin, extractionCollectorPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
@@ -0,0 +1,3 @@
1
// Public "plugins" entry point: imports the shared const chunk for its side
// effects, then re-exports the plugin factories under their public names.
import "./_chunks/const.mjs";
import { a as extractionCollectorPlugin, i as filterPlugin, n as isolateMainPlugin, o as extractionPlugin, r as frontmatterPlugin, s as createPlugin, t as tailwindPlugin } from "./_chunks/plugins.mjs";
export { createPlugin, extractionCollectorPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
@@ -0,0 +1,2 @@
1
// Public "preset/minimal" entry point: re-exports the minimal preset factory.
import { t as withMinimalPreset } from "../_chunks/minimal.mjs";
export { withMinimalPreset };
@@ -0,0 +1,34 @@
1
+ import "../_chunks/const.mjs";
2
+ //#region src/preset/minimal.ts
3
/**
 * Creates a configurable minimal preset with advanced options.
 * Returns declarative plugin config that works with both JS and Rust engines.
 *
 * Fix: user options are spread first and `clean` is resolved afterwards, so
 * an explicitly-passed `clean: undefined` no longer clobbers the default of
 * `true` (previously `...options` was spread after the computed `clean` key).
 *
 * @param options - Overrides merged over the preset defaults; entries in
 *   `options.plugins` override the preset's plugin config per key.
 */
function withMinimalPreset(options = {}) {
  return {
    ...options,
    // Default clean to true; any defined value (including null) is preserved.
    clean: options.clean !== void 0 ? options.clean : true,
    plugins: {
      frontmatter: true,
      isolateMain: true,
      tailwind: true,
      // Numeric element-tag ids (TAG_* constants from the const chunk) to
      // exclude — presumably chrome elements such as nav/footer; confirm
      // against the shared constants table.
      filter: { exclude: [
        40,
        68,
        103,
        58,
        47,
        88,
        73,
        59,
        66,
        65,
        43,
        41
      ] },
      ...options.plugins
    }
  };
}
33
+ //#endregion
34
+ export { withMinimalPreset };
@@ -0,0 +1,21 @@
1
+ import { p as SplitterOptions, s as MarkdownChunk } from "./_chunks/types.mjs";
2
+
3
//#region src/splitter.d.ts
/**
 * Convert HTML to Markdown and split into chunks in single pass.
 * Yields chunks during HTML event processing for better memory efficiency.
 *
 * **JavaScript engine only** — uses the JS engine's internal processing pipeline.
 * Not compatible with the Rust engine.
 *
 * @param html - The HTML source to convert.
 * @param options - Splitter configuration controlling chunking behavior.
 */
declare function htmlToMarkdownSplitChunksStream(html: string, options?: SplitterOptions): Generator<MarkdownChunk, void, undefined>;
/**
 * Convert HTML to Markdown and split into chunks in single pass.
 * Chunks are created during HTML event processing.
 *
 * **JavaScript engine only** — uses the JS engine's internal processing pipeline.
 * Not compatible with the Rust engine.
 *
 * @param html - The HTML source to convert.
 * @param options - Splitter configuration controlling chunking behavior.
 */
declare function htmlToMarkdownSplitChunks(html: string, options?: SplitterOptions): MarkdownChunk[];
//#endregion
export { type MarkdownChunk, type SplitterOptions, htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };