mdream 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -82,6 +82,33 @@ Mdream provides two main functions for working with HTML:
82
82
  - `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
83
83
  - `streamHtmlToMarkdown`: Best practice if you are fetching HTML over the network or reading it from a local file.
84
84
 
85
+ ## Browser CDN Usage
86
+
87
+ For browser environments, you can use mdream directly via CDN without any build step:
88
+
89
+ ```html
90
+ <!DOCTYPE html>
91
+ <html>
92
+ <head>
93
+ <script src="https://unpkg.com/mdream/dist/iife.js"></script>
94
+ </head>
95
+ <body>
96
+ <script>
97
+ // Convert HTML to Markdown in the browser
98
+ const html = '<h1>Hello World</h1><p>This is a paragraph.</p>'
99
+ const markdown = window.mdream.htmlToMarkdown(html)
100
+ console.log(markdown) // # Hello World\n\nThis is a paragraph.
101
+ </script>
102
+ </body>
103
+ </html>
104
+ ```
105
+
106
+ **CDN Options:**
107
+ - **unpkg**: `https://unpkg.com/mdream/dist/iife.js`
108
+ - **jsDelivr**: `https://cdn.jsdelivr.net/npm/mdream/dist/iife.js`
109
+
110
+ The browser build includes the core `htmlToMarkdown` function and is optimized for size (44kB uncompressed, 10.3kB gzipped).
111
+
85
112
  **Convert existing HTML**
86
113
 
87
114
  ```ts
@@ -0,0 +1,146 @@
1
+ import { createPlugin } from "./plugin-Bqz9GKOA.mjs";
2
+
3
+ //#region src/libs/query-selector.ts
4
/**
 * Splits a whitespace-separated token list (e.g. a `class` attribute value)
 * into its tokens. Any run of whitespace (space, tab, newline, ...) acts as a
 * separator, so multi-line or tab-indented attribute values are handled.
 *
 * @param {string} value - Raw attribute value.
 * @returns {string[]} The non-empty tokens, in source order.
 */
function splitWhitespaceTokens(value) {
	return String(value).split(/\s+/).filter(Boolean);
}
/**
 * Creates a tag selector matcher (e.g., 'div', 'p', 'h1').
 *
 * @param {string} tagName - Tag name compared strictly against `element.name`.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createTagSelector(tagName) {
	return {
		matches: (element) => element.name === tagName,
		toString: () => tagName
	};
}
/**
 * Creates an ID selector matcher (e.g., '#main', '#content').
 *
 * @param {string} selector - Selector text including the leading '#'.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createIdSelector(selector) {
	const id = selector.slice(1);
	return {
		matches: (element) => element.attributes?.id === id,
		toString: () => `#${id}`
	};
}
/**
 * Creates a class selector matcher (e.g., '.container', '.header').
 *
 * @param {string} selector - Selector text including the leading '.'.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createClassSelector(selector) {
	const className = selector.slice(1);
	return {
		matches: (element) => {
			const classValue = element.attributes?.class;
			if (!classValue) return false;
			return splitWhitespaceTokens(classValue).includes(className);
		},
		toString: () => `.${className}`
	};
}
/**
 * Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]').
 *
 * Supported operators: `=`, `^=`, `$=`, `*=`, `~=` and `|=`. When the selector
 * has no operator/value, only the presence of the attribute is checked.
 *
 * @param {string} selector - Selector text including the surrounding brackets.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createAttributeSelector(selector) {
	const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
	// Fall back to "everything between the brackets" when the pattern does not match.
	const attrName = match ? match[1] : selector.slice(1, -1);
	const operator = match?.[2];
	const attrValue = match?.[3];
	return {
		matches: (element) => {
			const attributes = element.attributes;
			// Own-property check: `in` would also match inherited names such as
			// "constructor" or "toString" on a plain attributes object.
			if (!attributes || !Object.prototype.hasOwnProperty.call(attributes, attrName)) return false;
			if (!operator || !attrValue) return true;
			const value = attributes[attrName];
			switch (operator) {
				case "=": return value === attrValue;
				case "^=": return value.startsWith(attrValue);
				case "$=": return value.endsWith(attrValue);
				case "*=": return value.includes(attrValue);
				case "~=": return splitWhitespaceTokens(value).includes(attrValue);
				case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
				default: return false;
			}
		},
		toString: () => {
			if (!operator || !attrValue) return `[${attrName}]`;
			return `[${attrName}${operator}${attrValue}]`;
		}
	};
}
/**
 * Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title').
 * The compound matches only when every part matches the element.
 *
 * @param {Array<{matches: Function, toString: Function}>} selectors - Parts to combine.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createCompoundSelector(selectors) {
	return {
		matches: (element) => selectors.every((selector) => selector.matches(element)),
		toString: () => selectors.map((s) => s.toString()).join("")
	};
}
/**
 * Builds the matcher for one simple selector part, chosen by its first character:
 * '.' -> class, '#' -> id, '[' -> attribute, anything else -> tag.
 *
 * @param {string} part - A non-empty simple selector.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 */
function createSimpleSelector(part) {
	switch (part[0]) {
		case ".": return createClassSelector(part);
		case "#": return createIdSelector(part);
		case "[": return createAttributeSelector(part);
		default: return createTagSelector(part);
	}
}
/**
 * Parses a CSS selector into a matcher.
 *
 * Supports tag, id, class and attribute selectors and compounds of them
 * (e.g. 'meta[name="description"]', 'div.container#main'). Combinators and
 * selector lists are not handled here.
 *
 * @param {string} selector - Selector text; surrounding whitespace is ignored.
 * @returns {{matches: (element: object) => boolean, toString: () => string}}
 * @throws {Error} "Empty selector" when the trimmed selector is empty.
 */
function parseSelector(selector) {
	const source = selector.trim();
	if (!source) throw new Error("Empty selector");
	const selectorParts = [];
	let current = "";
	let inAttribute = false;
	const flush = () => {
		if (current) selectorParts.push(createSimpleSelector(current));
		current = "";
	};
	for (const char of source) {
		if (inAttribute) {
			// Inside [...] the characters '.', '#' and '[' belong to the attribute
			// value (e.g. the dots in a URL) and must not start a new part.
			current += char;
			if (char === "]") inAttribute = false;
			continue;
		}
		if (char === "." || char === "#" || char === "[") flush();
		current += char;
		if (char === "[") inAttribute = true;
	}
	flush();
	if (selectorParts.length === 1) return selectorParts[0];
	return createCompoundSelector(selectorParts);
}
104
+
105
+ //#endregion
106
+ //#region src/plugins/extraction.ts
107
/**
 * Creates a plugin that reports elements matching the given selectors.
 *
 * While a matching element is open, the `value` of every text node found
 * beneath it (walking up the `parent` chain) is accumulated. When the element
 * exits, each callback registered for a matching selector is invoked with a
 * shallow copy of the element plus its trimmed `textContent`, and the state
 * passed to `onNodeExit`.
 *
 * @param {Record<string, (element: object, state: unknown) => void>} selectors
 *   Map of selector text to callback. Selectors are parsed with `parseSelector`.
 * @returns The plugin object produced by `createPlugin`.
 */
function extractionPlugin(selectors) {
	const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
		matcher: parseSelector(selector),
		callback
	}));
	const trackedElements = new Map();
	return createPlugin({
		onNodeEnter(element) {
			// Collect every matching callback: an element may satisfy several
			// selectors, and each of them must be notified (a single Map entry per
			// element would otherwise keep only the last match).
			const callbacks = [];
			for (const { matcher, callback } of matcherCallbacks) {
				if (matcher.matches(element)) callbacks.push(callback);
			}
			if (callbacks.length > 0) trackedElements.set(element, {
				textContent: "",
				callbacks
			});
		},
		processTextNode(textNode) {
			// Credit the text to every tracked ancestor, not just the nearest one.
			let currentParent = textNode.parent;
			while (currentParent) {
				const tracked = trackedElements.get(currentParent);
				if (tracked) tracked.textContent += textNode.value;
				currentParent = currentParent.parent;
			}
			return void 0;
		},
		onNodeExit(element, state) {
			const tracked = trackedElements.get(element);
			if (!tracked) return;
			// Drop the entry first so a throwing callback cannot leave it behind.
			trackedElements.delete(element);
			const extractedElement = {
				...element,
				textContent: tracked.textContent.trim()
			};
			for (const callback of tracked.callbacks) callback(extractedElement, state);
		}
	});
}
144
+
145
+ //#endregion
146
+ export { extractionPlugin, parseSelector };
@@ -0,0 +1,225 @@
1
+ import { htmlToMarkdown } from "./src-B4vBEPKi.mjs";
2
+ import { extractionPlugin } from "./extraction-BSOWm6fo.mjs";
3
+ import { readFile } from "node:fs/promises";
4
+ import { basename, dirname, relative, sep } from "pathe";
5
+ import { glob } from "tinyglobby";
6
+
7
+ //#region src/llms-txt.ts
8
+ /**
9
+ * Extract metadata from HTML content using mdream's extraction plugin
10
+ */
11
+ function extractMetadata(html, url) {
12
+ let title = "";
13
+ let description = "";
14
+ let keywords = "";
15
+ let author = "";
16
+ const extractionPluginInstance = extractionPlugin({
17
+ "title": (element) => {
18
+ if (!title && element.textContent) title = element.textContent.trim();
19
+ },
20
+ "meta[name=\"description\"]": (element) => {
21
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
22
+ },
23
+ "meta[property=\"og:description\"]": (element) => {
24
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
25
+ },
26
+ "meta[name=\"keywords\"]": (element) => {
27
+ if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
28
+ },
29
+ "meta[name=\"author\"]": (element) => {
30
+ if (!author && element.attributes?.content) author = element.attributes.content.trim();
31
+ },
32
+ "meta[property=\"og:title\"]": (element) => {
33
+ if (!title && element.attributes?.content) title = element.attributes.content.trim();
34
+ }
35
+ });
36
+ htmlToMarkdown(html, {
37
+ plugins: [extractionPluginInstance],
38
+ origin: url
39
+ });
40
+ return {
41
+ title: title || void 0,
42
+ description: description || void 0,
43
+ keywords: keywords || void 0,
44
+ author: author || void 0
45
+ };
46
+ }
47
+ /**
48
+ * Convert file path to URL path
49
+ */
50
+ function pathToUrl(filePath, baseDir) {
51
+ let url = relative(baseDir, filePath);
52
+ url = url.split(sep).join("/");
53
+ if (url.endsWith(".html")) url = url.slice(0, -5);
54
+ if (url.endsWith("/index")) url = url.slice(0, -6);
55
+ if (url === "index") return "/";
56
+ if (!url.startsWith("/")) url = `/${url}`;
57
+ return url;
58
+ }
59
+ /**
60
+ * Process HTML files from glob patterns
61
+ */
62
+ async function processHtmlFiles(patterns, origin) {
63
+ const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
64
+ const allFiles = [];
65
+ for (const pattern of allPatterns) {
66
+ const files = await glob(pattern);
67
+ allFiles.push(...files);
68
+ }
69
+ const uniqueFiles = [...new Set(allFiles)];
70
+ const results = [];
71
+ const baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
72
+ for (const filePath of uniqueFiles) try {
73
+ const html = await readFile(filePath, "utf-8");
74
+ const metadata = extractMetadata(html, origin || filePath);
75
+ const content = htmlToMarkdown(html, { origin });
76
+ const url = pathToUrl(filePath, baseDir);
77
+ results.push({
78
+ filePath,
79
+ title: metadata?.title || basename(filePath, ".html"),
80
+ content,
81
+ url,
82
+ metadata
83
+ });
84
+ } catch (error) {
85
+ console.error(`Error processing ${filePath}:`, error);
86
+ }
87
+ return results;
88
+ }
89
+ /**
90
+ * Generate llms.txt content
91
+ */
92
+ function generateLlmsTxtContent(files, options) {
93
+ const { siteName = "Site", description, origin = "" } = options;
94
+ let content = `# ${siteName}\n\n`;
95
+ if (description) content += `> ${description}\n\n`;
96
+ if (files.length > 0) {
97
+ content += `## Pages\n\n`;
98
+ for (const file of files) {
99
+ const desc = file.metadata?.description;
100
+ const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
101
+ if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
102
+ const relativePath = relative(options.outputDir, file.filePath);
103
+ content += `- [${file.title}](${relativePath})${descText}\n`;
104
+ } else {
105
+ const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
106
+ content += `- [${file.title}](${url})${descText}\n`;
107
+ }
108
+ }
109
+ }
110
+ return content;
111
+ }
112
/**
 * Parse frontmatter from markdown content.
 *
 * Recognises a leading block delimited by `---` lines and reads each
 * `key: value` line inside it into a flat object of trimmed strings (the
 * value is everything after the first colon). Lines without a colon, or
 * starting with one, are ignored. Both LF and CRLF line endings are accepted,
 * and the closing `---` may be the last thing in the document.
 *
 * @param {string} content - Markdown text, possibly starting with frontmatter.
 * @returns {{frontmatter: Record<string, string> | null, body: string}}
 *   `frontmatter` is `null` (and `body` is the untouched input) when no
 *   frontmatter block is found; otherwise `body` is the text after the block.
 */
function parseFrontmatter(content) {
	// `\r?\n` tolerates Windows line endings; the optional tail lets a document
	// that ends right after the closing fence still be recognised.
	const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/;
	const match = content.match(frontmatterRegex);
	if (!match) return {
		frontmatter: null,
		body: content
	};
	const frontmatter = {};
	for (const line of match[1].split(/\r?\n/)) {
		const colonIndex = line.indexOf(":");
		// colonIndex 0 would mean an empty key, -1 means no key/value pair at all.
		if (colonIndex > 0) {
			const key = line.slice(0, colonIndex).trim();
			frontmatter[key] = line.slice(colonIndex + 1).trim();
		}
	}
	return {
		frontmatter,
		body: match[2] ?? ""
	};
}
139
+ /**
140
+ * Serialize frontmatter object to YAML-like format
141
+ */
142
+ function serializeFrontmatter(data) {
143
+ const lines = [];
144
+ for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
145
+ return lines.join("\n");
146
+ }
147
+ /**
148
+ * Generate llms-full.txt content with complete page content
149
+ */
150
+ function generateLlmsFullTxtContent(files, options) {
151
+ const { siteName = "Site", description, origin = "" } = options;
152
+ let content = `# ${siteName}\n\n`;
153
+ if (description) content += `> ${description}\n\n`;
154
+ if (files.length > 0) {
155
+ content += `## Table of Contents\n\n`;
156
+ for (const file of files) {
157
+ const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
158
+ content += `- [${file.title}](#${anchor})\n`;
159
+ }
160
+ content += `\n---\n\n`;
161
+ for (const file of files) {
162
+ const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
163
+ const { frontmatter, body } = parseFrontmatter(file.content);
164
+ const metadata = {
165
+ title: file.title,
166
+ url
167
+ };
168
+ if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
169
+ else if (file.filePath) metadata.file = file.filePath;
170
+ if (file.metadata) {
171
+ if (file.metadata.description) metadata.description = file.metadata.description;
172
+ if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
173
+ if (file.metadata.author) metadata.author = file.metadata.author;
174
+ }
175
+ const mergedFrontmatter = frontmatter ? {
176
+ ...frontmatter,
177
+ ...metadata
178
+ } : metadata;
179
+ const frontmatterString = serializeFrontmatter(mergedFrontmatter);
180
+ let contentBody = frontmatter ? body : file.content;
181
+ const titleLine = contentBody.trim().split("\n")[0];
182
+ if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
183
+ content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
184
+ }
185
+ }
186
+ return content;
187
+ }
188
+ /**
189
+ * Generate individual markdown files structure
190
+ */
191
+ function generateMarkdownFilesContent(files) {
192
+ const markdownFiles = [];
193
+ for (const file of files) {
194
+ const urlPath = file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "");
195
+ const mdPath = `md/${urlPath}.md`;
196
+ markdownFiles.push({
197
+ path: mdPath,
198
+ content: file.content
199
+ });
200
+ }
201
+ return markdownFiles;
202
+ }
203
+ /**
204
+ * Main function to process files and generate llms.txt artifacts
205
+ */
206
+ async function generateLlmsTxtArtifacts(options) {
207
+ let files;
208
+ if (options.files) files = options.files;
209
+ else if (options.patterns) files = await processHtmlFiles(options.patterns, options.origin);
210
+ else throw new Error("Either patterns or files must be provided");
211
+ const llmsTxt = generateLlmsTxtContent(files, options);
212
+ let llmsFullTxt;
213
+ if (options.generateFull) llmsFullTxt = generateLlmsFullTxtContent(files, options);
214
+ let markdownFiles;
215
+ if (options.generateMarkdown) markdownFiles = generateMarkdownFilesContent(files);
216
+ return {
217
+ llmsTxt,
218
+ llmsFullTxt,
219
+ markdownFiles,
220
+ processedFiles: files
221
+ };
222
+ }
223
+
224
+ //#endregion
225
+ export { generateLlmsTxtArtifacts };
@@ -1,5 +1,5 @@
1
- import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./extraction-D28Kr1J3.mjs";
2
- import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-DXY-fo9h.mjs";
1
+ import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./plugin-Bqz9GKOA.mjs";
2
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-TeB1_RYL.mjs";
3
3
 
4
4
  //#region src/preset/minimal.ts
5
5
  /**
@@ -1,4 +1,4 @@
1
- import { Plugin } from "./types-E56bjFoA.mjs";
1
+ import { Plugin } from "./types-B94khc0C.mjs";
2
2
 
3
3
  //#region src/pluggable/plugin.d.ts
4
4
 
@@ -284,109 +284,6 @@ const BLOCKQUOTE_SPACING = [1, 1];
284
284
  const LIST_ITEM_SPACING = [1, 0];
285
285
  const TABLE_ROW_SPACING = [0, 1];
286
286
 
287
- //#endregion
288
- //#region src/libs/query-selector.ts
289
- /**
290
- * Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
291
- */
292
- function createTagSelector(tagName) {
293
- return {
294
- matches: (element) => element.name === tagName,
295
- toString: () => tagName
296
- };
297
- }
298
- /**
299
- * Creates an ID selector matcher (e.g., '#main', '#content')
300
- */
301
- function createIdSelector(selector) {
302
- const id = selector.slice(1);
303
- return {
304
- matches: (element) => element.attributes?.id === id,
305
- toString: () => `#${id}`
306
- };
307
- }
308
- /**
309
- * Creates a class selector matcher (e.g., '.container', '.header')
310
- */
311
- function createClassSelector(selector) {
312
- const className = selector.slice(1);
313
- return {
314
- matches: (element) => {
315
- if (!element.attributes?.class) return false;
316
- const classes = element.attributes.class.trim().split(" ").filter(Boolean);
317
- return classes.includes(className);
318
- },
319
- toString: () => `.${className}`
320
- };
321
- }
322
- /**
323
- * Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
324
- */
325
- function createAttributeSelector(selector) {
326
- const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
327
- const attrName = match ? match[1] : selector.slice(1, -1);
328
- const operator = match?.[2];
329
- const attrValue = match?.[3];
330
- return {
331
- matches: (element) => {
332
- if (!(attrName in (element.attributes || {}))) return false;
333
- if (!operator || !attrValue) return true;
334
- const value = element.attributes[attrName];
335
- switch (operator) {
336
- case "=": return value === attrValue;
337
- case "^=": return value.startsWith(attrValue);
338
- case "$=": return value.endsWith(attrValue);
339
- case "*=": return value.includes(attrValue);
340
- case "~=": return value.trim().split(" ").filter(Boolean).includes(attrValue);
341
- case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
342
- default: return false;
343
- }
344
- },
345
- toString: () => {
346
- if (!operator || !attrValue) return `[${attrName}]`;
347
- return `[${attrName}${operator}${attrValue}]`;
348
- }
349
- };
350
- }
351
- /**
352
- * Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title')
353
- */
354
- function createCompoundSelector(selectors) {
355
- return {
356
- matches: (element) => selectors.every((selector) => selector.matches(element)),
357
- toString: () => selectors.map((s) => s.toString()).join("")
358
- };
359
- }
360
- /**
361
- * Parses a CSS selector into a matcher
362
- */
363
- function parseSelector(selector) {
364
- selector = selector.trim();
365
- if (!selector) throw new Error("Empty selector");
366
- const selectorParts = [];
367
- let current = "";
368
- let inAttribute = false;
369
- for (let i = 0; i < selector.length; i++) {
370
- const char = selector[i];
371
- if ((char === "." || char === "#" || char === "[") && current) {
372
- if (current[0] === ".") selectorParts.push(createClassSelector(current));
373
- else if (current[0] === "#") selectorParts.push(createIdSelector(current));
374
- else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
375
- else selectorParts.push(createTagSelector(current));
376
- current = char;
377
- } else current += char;
378
- if (char === "[") inAttribute = true;
379
- if (char === "]") inAttribute = false;
380
- if (inAttribute && char !== "[") {}
381
- }
382
- if (current) if (current[0] === ".") selectorParts.push(createClassSelector(current));
383
- else if (current[0] === "#") selectorParts.push(createIdSelector(current));
384
- else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
385
- else selectorParts.push(createTagSelector(current));
386
- if (selectorParts.length === 1) return selectorParts[0];
387
- return createCompoundSelector(selectorParts);
388
- }
389
-
390
287
  //#endregion
391
288
  //#region src/pluggable/plugin.ts
392
289
  /**
@@ -399,44 +296,4 @@ function createPlugin(plugin) {
399
296
  }
400
297
 
401
298
  //#endregion
402
- //#region src/plugins/extraction.ts
403
- function extractionPlugin(selectors) {
404
- const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
405
- matcher: parseSelector(selector),
406
- callback
407
- }));
408
- const trackedElements = new Map();
409
- return createPlugin({
410
- onNodeEnter(element) {
411
- matcherCallbacks.forEach(({ matcher, callback }) => {
412
- if (matcher.matches(element)) trackedElements.set(element, {
413
- textContent: "",
414
- callback
415
- });
416
- });
417
- },
418
- processTextNode(textNode) {
419
- let currentParent = textNode.parent;
420
- while (currentParent) {
421
- const tracked = trackedElements.get(currentParent);
422
- if (tracked) tracked.textContent += textNode.value;
423
- currentParent = currentParent.parent;
424
- }
425
- return void 0;
426
- },
427
- onNodeExit(element, state) {
428
- const tracked = trackedElements.get(element);
429
- if (tracked) {
430
- const extractedElement = {
431
- ...element,
432
- textContent: tracked.textContent.trim()
433
- };
434
- tracked.callback(extractedElement, state);
435
- trackedElements.delete(element);
436
- }
437
- }
438
- });
439
- }
440
-
441
- //#endregion
442
- export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin, extractionPlugin, parseSelector };
299
+ export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin };
@@ -1,4 +1,5 @@
1
- import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin, parseSelector } from "./extraction-D28Kr1J3.mjs";
1
+ import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin } from "./plugin-Bqz9GKOA.mjs";
2
+ import { parseSelector } from "./extraction-BSOWm6fo.mjs";
2
3
 
3
4
  //#region src/plugins/filter.ts
4
5
  /**
@@ -1,7 +1,4 @@
1
- import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, extractionPlugin } from "./extraction-D28Kr1J3.mjs";
2
- import { readFile } from "node:fs/promises";
3
- import { basename, dirname, relative, sep } from "pathe";
4
- import { glob } from "tinyglobby";
1
+ import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./plugin-Bqz9GKOA.mjs";
5
2
 
6
3
  //#region src/tags.ts
7
4
  function resolveUrl(url, origin) {
@@ -1387,224 +1384,6 @@ function createMarkdownProcessor(options = {}) {
1387
1384
  }
1388
1385
  const MarkdownProcessor = createMarkdownProcessor;
1389
1386
 
1390
- //#endregion
1391
- //#region src/llms-txt.ts
1392
- /**
1393
- * Extract metadata from HTML content using mdream's extraction plugin
1394
- */
1395
- function extractMetadata(html, url) {
1396
- let title = "";
1397
- let description = "";
1398
- let keywords = "";
1399
- let author = "";
1400
- const extractionPluginInstance = extractionPlugin({
1401
- "title": (element) => {
1402
- if (!title && element.textContent) title = element.textContent.trim();
1403
- },
1404
- "meta[name=\"description\"]": (element) => {
1405
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
1406
- },
1407
- "meta[property=\"og:description\"]": (element) => {
1408
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
1409
- },
1410
- "meta[name=\"keywords\"]": (element) => {
1411
- if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
1412
- },
1413
- "meta[name=\"author\"]": (element) => {
1414
- if (!author && element.attributes?.content) author = element.attributes.content.trim();
1415
- },
1416
- "meta[property=\"og:title\"]": (element) => {
1417
- if (!title && element.attributes?.content) title = element.attributes.content.trim();
1418
- }
1419
- });
1420
- htmlToMarkdown(html, {
1421
- plugins: [extractionPluginInstance],
1422
- origin: url
1423
- });
1424
- return {
1425
- title: title || void 0,
1426
- description: description || void 0,
1427
- keywords: keywords || void 0,
1428
- author: author || void 0
1429
- };
1430
- }
1431
- /**
1432
- * Convert file path to URL path
1433
- */
1434
- function pathToUrl(filePath, baseDir) {
1435
- let url = relative(baseDir, filePath);
1436
- url = url.split(sep).join("/");
1437
- if (url.endsWith(".html")) url = url.slice(0, -5);
1438
- if (url.endsWith("/index")) url = url.slice(0, -6);
1439
- if (url === "index") return "/";
1440
- if (!url.startsWith("/")) url = `/${url}`;
1441
- return url;
1442
- }
1443
- /**
1444
- * Process HTML files from glob patterns
1445
- */
1446
- async function processHtmlFiles(patterns, origin) {
1447
- const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
1448
- const allFiles = [];
1449
- for (const pattern of allPatterns) {
1450
- const files = await glob(pattern);
1451
- allFiles.push(...files);
1452
- }
1453
- const uniqueFiles = [...new Set(allFiles)];
1454
- const results = [];
1455
- const baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
1456
- for (const filePath of uniqueFiles) try {
1457
- const html = await readFile(filePath, "utf-8");
1458
- const metadata = extractMetadata(html, origin || filePath);
1459
- const content = htmlToMarkdown(html, { origin });
1460
- const url = pathToUrl(filePath, baseDir);
1461
- results.push({
1462
- filePath,
1463
- title: metadata?.title || basename(filePath, ".html"),
1464
- content,
1465
- url,
1466
- metadata
1467
- });
1468
- } catch (error) {
1469
- console.error(`Error processing ${filePath}:`, error);
1470
- }
1471
- return results;
1472
- }
1473
- /**
1474
- * Generate llms.txt content
1475
- */
1476
- function generateLlmsTxtContent(files, options) {
1477
- const { siteName = "Site", description, origin = "" } = options;
1478
- let content = `# ${siteName}\n\n`;
1479
- if (description) content += `> ${description}\n\n`;
1480
- if (files.length > 0) {
1481
- content += `## Pages\n\n`;
1482
- for (const file of files) {
1483
- const desc = file.metadata?.description;
1484
- const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
1485
- if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
1486
- const relativePath = relative(options.outputDir, file.filePath);
1487
- content += `- [${file.title}](${relativePath})${descText}\n`;
1488
- } else {
1489
- const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
1490
- content += `- [${file.title}](${url})${descText}\n`;
1491
- }
1492
- }
1493
- }
1494
- return content;
1495
- }
1496
- /**
1497
- * Parse frontmatter from markdown content
1498
- */
1499
- function parseFrontmatter(content) {
1500
- const frontmatterRegex = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
1501
- const match = content.match(frontmatterRegex);
1502
- if (!match) return {
1503
- frontmatter: null,
1504
- body: content
1505
- };
1506
- const frontmatterContent = match[1];
1507
- const body = match[2];
1508
- const frontmatter = {};
1509
- const lines = frontmatterContent.split("\n");
1510
- for (const line of lines) {
1511
- const colonIndex = line.indexOf(":");
1512
- if (colonIndex > 0) {
1513
- const key = line.substring(0, colonIndex).trim();
1514
- const value = line.substring(colonIndex + 1).trim();
1515
- frontmatter[key] = value;
1516
- }
1517
- }
1518
- return {
1519
- frontmatter,
1520
- body
1521
- };
1522
- }
1523
- /**
1524
- * Serialize frontmatter object to YAML-like format
1525
- */
1526
- function serializeFrontmatter(data) {
1527
- const lines = [];
1528
- for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
1529
- return lines.join("\n");
1530
- }
1531
- /**
1532
- * Generate llms-full.txt content with complete page content
1533
- */
1534
- function generateLlmsFullTxtContent(files, options) {
1535
- const { siteName = "Site", description, origin = "" } = options;
1536
- let content = `# ${siteName}\n\n`;
1537
- if (description) content += `> ${description}\n\n`;
1538
- if (files.length > 0) {
1539
- content += `## Table of Contents\n\n`;
1540
- for (const file of files) {
1541
- const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
1542
- content += `- [${file.title}](#${anchor})\n`;
1543
- }
1544
- content += `\n---\n\n`;
1545
- for (const file of files) {
1546
- const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
1547
- const { frontmatter, body } = parseFrontmatter(file.content);
1548
- const metadata = {
1549
- title: file.title,
1550
- url
1551
- };
1552
- if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
1553
- else if (file.filePath) metadata.file = file.filePath;
1554
- if (file.metadata) {
1555
- if (file.metadata.description) metadata.description = file.metadata.description;
1556
- if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
1557
- if (file.metadata.author) metadata.author = file.metadata.author;
1558
- }
1559
- const mergedFrontmatter = frontmatter ? {
1560
- ...frontmatter,
1561
- ...metadata
1562
- } : metadata;
1563
- const frontmatterString = serializeFrontmatter(mergedFrontmatter);
1564
- let contentBody = frontmatter ? body : file.content;
1565
- const titleLine = contentBody.trim().split("\n")[0];
1566
- if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
1567
- content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
1568
- }
1569
- }
1570
- return content;
1571
- }
1572
- /**
1573
- * Generate individual markdown files structure
1574
- */
1575
- function generateMarkdownFilesContent(files) {
1576
- const markdownFiles = [];
1577
- for (const file of files) {
1578
- const urlPath = file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "");
1579
- const mdPath = `md/${urlPath}.md`;
1580
- markdownFiles.push({
1581
- path: mdPath,
1582
- content: file.content
1583
- });
1584
- }
1585
- return markdownFiles;
1586
- }
1587
- /**
1588
- * Main function to process files and generate llms.txt artifacts
1589
- */
1590
- async function generateLlmsTxtArtifacts(options) {
1591
- let files;
1592
- if (options.files) files = options.files;
1593
- else if (options.patterns) files = await processHtmlFiles(options.patterns, options.origin);
1594
- else throw new Error("Either patterns or files must be provided");
1595
- const llmsTxt = generateLlmsTxtContent(files, options);
1596
- let llmsFullTxt;
1597
- if (options.generateFull) llmsFullTxt = generateLlmsFullTxtContent(files, options);
1598
- let markdownFiles;
1599
- if (options.generateMarkdown) markdownFiles = generateMarkdownFilesContent(files);
1600
- return {
1601
- llmsTxt,
1602
- llmsFullTxt,
1603
- markdownFiles,
1604
- processedFiles: files
1605
- };
1606
- }
1607
-
1608
1387
  //#endregion
1609
1388
  //#region src/stream.ts
1610
1389
  /**
@@ -1655,4 +1434,4 @@ function htmlToMarkdown(html, options = {}) {
1655
1434
  }
1656
1435
 
1657
1436
  //#endregion
1658
- export { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
1437
+ export { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/cli.mjs CHANGED
@@ -1,13 +1,15 @@
1
- import "./_chunks/extraction-D28Kr1J3.mjs";
2
- import { generateLlmsTxtArtifacts, streamHtmlToMarkdown } from "./_chunks/src-DYO16Ybo.mjs";
3
- import "./_chunks/plugins-DXY-fo9h.mjs";
4
- import { withMinimalPreset } from "./_chunks/minimal-CCnrG7a1.mjs";
5
- import { mkdir, writeFile } from "node:fs/promises";
6
- import { dirname, join, resolve } from "pathe";
1
+ import "./_chunks/plugin-Bqz9GKOA.mjs";
2
+ import { streamHtmlToMarkdown } from "./_chunks/src-B4vBEPKi.mjs";
3
+ import "./_chunks/extraction-BSOWm6fo.mjs";
4
+ import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-B4Tz5bHd.mjs";
5
+ import "./_chunks/plugins-TeB1_RYL.mjs";
6
+ import { withMinimalPreset } from "./_chunks/minimal-DSW9dhXV.mjs";
7
7
  import { readFileSync } from "node:fs";
8
+ import { mkdir, writeFile } from "node:fs/promises";
8
9
  import { Readable } from "node:stream";
9
10
  import { fileURLToPath } from "node:url";
10
11
  import { cac } from "cac";
12
+ import { dirname, join, resolve } from "pathe";
11
13
 
12
14
  //#region src/cli.ts
13
15
  async function streamingConvert(options = {}) {
package/dist/iife.js ADDED
@@ -0,0 +1,22 @@
1
+ (function() {
2
+ 'use strict';
3
+
4
+ function e(e,t,n){if(!t)return;let r=e.regionId||0,i=n.regionContentBuffers.get(r);i&&(i.push(t),n.lastContentCache=t)}function t(e){let t=[];for(let[n,r]of Array.from(e.regionContentBuffers.entries())){let i=e.regionToggles.get(n);i&&t.push(...r)}return e.regionToggles.clear(),e.regionContentBuffers.clear(),t.join(``).trimStart()}const n=0,r=1,i=2,a=3,o=4,s=5,c=6,l=7,u=8,d=9,f=10,p=11,m=12,h=13,g=14,_=15,v=16,y=17,ee=18,te=19,ne=20,re=21,b=22,x=23,S=24,C=25,w=26,ie=27,T=28,ae=29,oe=30,se=31,E=32,D=33,O=34,ce=35,le=36,ue=37,de=38,fe=39,pe=40,me=41,he=42,ge=43,_e=44,ve=45,ye=46,k=47,be=48,A=49,xe=50,Se=51,j=52,M=53,N=54,P=55,F=56,Ce=57,we=58,Te=59,Ee=60,De=61,Oe=62,ke=63,Ae=64,je=65,Me=66,Ne=67,Pe=68,Fe=69,Ie=70,Le=71,Re=72,ze=73,Be=74,Ve=75,He=76,Ue=77,We=78,Ge=79,Ke=80,qe=81,Je=82,Ye=83,Xe=84,Ze=85,Qe=86,$e=87,et=88,tt=89,nt=90,rt=91,it=92,at=93,ot=94,st=95,ct=96,lt=97,ut=98,dt=99,ft=100,pt=101,mt=102,ht=103,gt=104,_t=105,vt=106,yt=107,bt=108,xt={"&amp;":`&`,"&lt;":`<`,"&gt;":`>`,"&quot;":`"`,"&#39;":`'`,"&apos;":`'`,"&nbsp;":` 
`},I=1,L=2,R=0,St=1,Ct={html:n,head:r,details:i,summary:a,title:o,meta:s,br:c,h1:l,h2:u,h3:d,h4:f,h5:p,h6:m,hr:h,strong:g,b:_,em:v,i:y,del:ee,sub:te,sup:ne,ins:re,blockquote:b,code:x,ul:S,li:C,a:w,img:ie,table:T,thead:ae,tr:oe,th:se,td:E,ol:D,pre:O,p:ce,div:le,span:ue,tbody:de,tfoot:fe,form:pe,nav:me,label:he,button:ge,body:_e,center:ve,kbd:ye,footer:k,path:be,svg:A,article:xe,section:Se,script:j,style:M,link:N,area:P,base:F,col:Ce,embed:we,input:Te,keygen:Ee,param:De,source:Oe,track:ke,wbr:Ae,select:je,textarea:Me,option:Ne,fieldset:Pe,legend:Fe,audio:Ie,video:Le,canvas:Re,iframe:ze,map:Be,dialog:Ve,meter:He,progress:Ue,template:We,abbr:Ge,mark:Ke,q:qe,samp:Je,small:Ye,noscript:Xe,noframes:Ze,xmp:Qe,plaintext:$e,aside:et,u:tt,cite:nt,dfn:rt,var:it,time:at,bdo:ot,ruby:st,rt:ct,rp:lt,dd:ut,dt,dl:pt,address:ft,figure:mt,object:ht,main:gt,header:_t,figcaption:vt,caption:yt},wt=`**`,Tt=`_`,Et=`~~`,Dt="```",Ot="`",kt=`---`,z=[0,0],At=[2,2],jt=[1,1],Mt=[1,0],B=[0,1];function Nt(e,t){if(!e)return e;if(e.startsWith(`//`))return`https:${e}`;if(t){if(e.startsWith(`/`)&&t){let n=t.endsWith(`/`)?t.slice(0,-1):t;return`${n}${e}`}if(e.startsWith(`./`))return`${t}/${e.slice(2)}`;if(!e.startsWith(`http`)){let n=e.startsWith(`/`)?e.slice(1):e;return`${t}/${n}`}}return e}function V(e){return e.depthMap[E]>0}function Pt(e){if(!e)return``;let t=e.split(` `).map(e=>e.split(`language-`)[1]).filter(Boolean);return t.length>0?t[0].trim():``}function H(e){return{enter:({node:t})=>t.depthMap[w]?`<h${e}>`:`${`#`.repeat(e)} `,exit:({node:t})=>{if(t.depthMap[w])return`</h${e}>`},collapsesInnerWhiteSpace:!0}}const Ft={enter:({node:e})=>e.depthMap[_]>1?``:wt,exit:({node:e})=>e.depthMap[_]>1?``:wt,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},It={enter:({node:e})=>e.depthMap[y]>1?``:Tt,exit:({node:e})=>e.depthMap[y]>1?``:Tt,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},Lt={[r]:{spacing:z,collapsesInnerWhiteSpace:!0},[i]:{enter:()=>`<details>`,exit:()=>`</details>
5
+
6
+ `},[a]:{enter:()=>`<summary>`,exit:()=>`</summary>
7
+
8
+ `},[o]:{collapsesInnerWhiteSpace:!0,isNonNesting:!0,spacing:z},[j]:{excludesTextNodes:!0,isNonNesting:!0},[M]:{isNonNesting:!0,excludesTextNodes:!0},[s]:{collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z},[c]:{enter:({node:e})=>V(e)?`<br>`:void 0,isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[l]:H(1),[u]:H(2),[d]:H(3),[f]:H(4),[p]:H(5),[m]:H(6),[h]:{enter:()=>kt,isSelfClosing:!0},[g]:Ft,[_]:Ft,[v]:It,[y]:It,[ee]:{enter:()=>Et,exit:()=>Et,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[te]:{enter:()=>`<sub>`,exit:()=>`</sub>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ne]:{enter:()=>`<sup>`,exit:()=>`</sup>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[re]:{enter:()=>`<ins>`,exit:()=>`</ins>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[b]:{enter:({node:e})=>{let t=e.depthMap[b]||1,n=`> `.repeat(t);return e.depthMap[C]>0&&(n=`\n${` `.repeat(e.depthMap[C])}${n}`),n},spacing:jt},[x]:{enter:({node:e})=>{if((e.depthMap[O]||0)>0){let t=Pt(e.attributes?.class);return`${Dt}${t}\n`}return Ot},exit:({node:e})=>e.depthMap[O]>0?`\n${Dt}`:Ot,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[S]:{enter:({node:e})=>V(e)?`<ul>`:void 0,exit:({node:e})=>V(e)?`</ul>`:void 0},[C]:{enter:({node:e})=>{if(V(e))return`<li>`;let t=(e.depthMap[S]||0)+(e.depthMap[D]||0)-1,n=e.parent?.tagId===D,r=` `.repeat(Math.max(0,t)),i=n?`${e.index+1}. 
`:`- `;return`${r}${i}`},exit:({node:e})=>V(e)?`</li>`:void 0,spacing:Mt},[w]:{enter:({node:e})=>{if(e.attributes?.href)return`[`},exit:({node:e,state:t})=>{if(!e.attributes?.href)return``;let n=Nt(e.attributes?.href||``,t.options?.origin),r=e.attributes?.title,i=t.lastContentCache;return i===r&&(r=``),r?`](${n} "${r}")`:`](${n})`},collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ie]:{enter:({node:e,state:t})=>{let n=e.attributes?.alt||``,r=Nt(e.attributes?.src||``,t.options?.origin);return`![${n}](${r})`},collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z,isInline:!0},[T]:{enter:({node:e,state:t})=>{if(V(e))return`<table>`;e.depthMap[T]<=1&&(t.tableRenderedTable=!1),t.tableColumnAlignments=[]},exit:({node:e})=>V(e)?`</table>`:void 0},[ae]:{enter:({node:e})=>{if(V(e))return`<thead>`},exit:({node:e})=>V(e)?`</thead>`:void 0,spacing:B,excludesTextNodes:!0},[oe]:{enter:({node:e,state:t})=>V(e)?`<tr>`:(t.tableCurrentRowCells=0,`| `),exit:({node:e,state:t})=>{if(V(e)||e.depthMap[T]>1)return`</tr>`;if(!t.tableRenderedTable){t.tableRenderedTable=!0;let e=t.tableColumnAlignments;for(;e.length<t.tableCurrentRowCells;)e.push(``);let n=e.map(e=>{switch(e){case`left`:return`:---`;case`center`:return`:---:`;case`right`:return`---:`;default:return`---`}});return` |\n| ${n.join(` | `)} |`}return` |`},excludesTextNodes:!0,spacing:B},[se]:{enter:({node:e,state:t})=>{if(e.depthMap[T]>1)return`<th>`;let n=e.attributes?.align?.toLowerCase();return n?t.tableColumnAlignments.push(n):t.tableColumnAlignments.length<=t.tableCurrentRowCells&&t.tableColumnAlignments.push(``),e.index===0?``:` | `},exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</th>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[E]:{enter:({node:e})=>e.depthMap[T]>1?`<td>`:e.index===0?``:` | 
`,exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</td>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[ce]:{},[le]:{},[ue]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[me]:{},[he]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ge]:{collapsesInnerWhiteSpace:!0,isInline:!0},[_e]:{spacing:z},[ve]:{enter:({node:e})=>{if(e.depthMap[T]>1)return`<center>`},exit:({node:e})=>{if(e.depthMap[T]>1)return`</center>`},spacing:z},[de]:{spacing:z,excludesTextNodes:!0},[fe]:{spacing:B,excludesTextNodes:!0},[ye]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[k]:{spacing:z},[pe]:{spacing:z},[N]:{isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[P]:{isSelfClosing:!0,spacing:z,isInline:!0},[F]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ce]:{isSelfClosing:!0,spacing:z},[we]:{isSelfClosing:!0,spacing:z},[Te]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ee]:{isSelfClosing:!0,spacing:z,isInline:!0},[De]:{isSelfClosing:!0,spacing:z},[Oe]:{isSelfClosing:!0,spacing:z},[ke]:{isSelfClosing:!0,spacing:z},[Ae]:{isSelfClosing:!0,spacing:z,isInline:!0},[A]:{spacing:z},[je]:{spacing:z},[Me]:{isNonNesting:!0,spacing:z},[Ne]:{isNonNesting:!0,spacing:z},[Pe]:{spacing:z},[Fe]:{spacing:z},[Ie]:{spacing:z},[Le]:{spacing:z},[Re]:{spacing:z},[ze]:{isNonNesting:!0,spacing:z},[Be]:{spacing:z},[Ve]:{spacing:z},[He]:{spacing:z},[Ue]:{spacing:z},[We]:{spacing:z},[Ge]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ke]:{enter:()=>`<mark>`,exit:()=>`</mark>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[qe]:{enter:()=>`"`,exit:()=>`"`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Je]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ye]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Xe]:{excludesTextNodes:!0,spacing:z},[Ze]:{isNonNesting:!0,spacing:z},[Qe]:{isNonNesting:!0,spacing:z},[$e]:{isNonNesting:!0,spacing:z},[et]:{spacing
:z},[tt]:{enter:()=>`<u>`,exit:()=>`</u>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[nt]:{enter:()=>`*`,exit:()=>`*`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[rt]:{enter:()=>`**`,exit:()=>`**`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[it]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[at]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ot]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[st]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ct]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[lt]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ft]:{enter:()=>`<address>`,exit:()=>`</address>`,spacing:z,collapsesInnerWhiteSpace:!0},[pt]:{spacing:z,enter:()=>`<dl>`,exit:()=>`</dl>`},[dt]:{enter:()=>`<dt>`,exit:()=>`</dt>`,collapsesInnerWhiteSpace:!0,spacing:[0,1]},[ut]:{enter:()=>`<dd>`,exit:()=>`</dd>`,spacing:[0,1]}};function Rt(e){let t=``,n=0;for(;n<e.length;){if(e[n]===`&`){let r=!1;for(let[i,a]of Object.entries(xt))if(e.startsWith(i,n)){t+=a,n+=i.length,r=!0;break}if(r)continue;if(n+2<e.length&&e[n+1]===`#`){let r=n;n+=2;let i=e[n]===`x`||e[n]===`X`;i&&n++;let a=n;for(;n<e.length&&e[n]!==`;`;)n++;if(n<e.length&&e[n]===`;`){let r=e.substring(a,n),o=i?16:10;try{let e=Number.parseInt(r,o);if(!Number.isNaN(e)){t+=String.fromCodePoint(e),n++;continue}}catch{}}n=r}}t+=e[n],n++}return t}function zt(e){let t=e,n=[t];for(;t.tagHandler?.isInline&&t.parent;)t=t.parent,n.push(t);return n}const Bt=60,U=62,W=47,G=61,K=34,q=39,Vt=33,Ht=38,J=92,Y=45,X=32,Ut=9,Wt=10,Gt=13,Kt=96,qt=124,Jt=91,Yt=93,Xt=Object.freeze({});function Zt(e){return new Uint8Array(e)}function Z(e){return e===X||e===Ut||e===Wt||e===Gt}function Qt(e,t,n){return $t(e,t,n)}function $t(e,t,n){let r=``;t.depthMap??=new 
Uint8Array(bt),t.depth??=0,t.lastCharWasWhitespace??=!0,t.justClosedTag??=!1,t.isFirstTextInElement??=!1,t.lastCharWasBackslash??=!1;let i=0,a=e.length;for(;i<a;){let o=e.charCodeAt(i);if(o!==Bt){if(o===Ht&&(t.hasEncodedHtmlEntity=!0),Z(o)){let n=t.depthMap[O]>0;if(t.justClosedTag&&(t.justClosedTag=!1,t.lastCharWasWhitespace=!1),!n&&t.lastCharWasWhitespace){i++;continue}n?r+=e[i]:(o===X||!t.lastCharWasWhitespace)&&(r+=` `),t.lastCharWasWhitespace=!0,t.textBufferContainsWhitespace=!0,t.lastCharWasBackslash=!1}else t.textBufferContainsNonWhitespace=!0,t.lastCharWasWhitespace=!1,t.justClosedTag=!1,o===qt&&t.depthMap[T]?r+=`\\|`:o===Kt&&(t.depthMap[x]||t.depthMap[O])?r+="\\`":o===Jt&&t.depthMap[w]?r+=`\\[`:o===Yt&&t.depthMap[w]?r+=`\\]`:o===U&&t.depthMap[b]?r+=`\\>`:r+=e[i],t.currentNode?.tagHandler?.isNonNesting&&(t.lastCharWasBackslash||(o===q&&!t.inDoubleQuote&&!t.inBacktick?t.inSingleQuote=!t.inSingleQuote:o===K&&!t.inSingleQuote&&!t.inBacktick?t.inDoubleQuote=!t.inDoubleQuote:o===Kt&&!t.inSingleQuote&&!t.inDoubleQuote&&(t.inBacktick=!t.inBacktick))),t.lastCharWasBackslash=o===J;i++;continue}if(i+1>=a){r+=e[i];break}let s=e.charCodeAt(i+1);if(s===Vt){r.length>0&&(Q(r,t,n),r=``);let a=tn(e,i);if(a.complete)i=a.newPosition;else{r+=a.remainingText;break}}else if(s===W){let a=t.inSingleQuote||t.inDoubleQuote||t.inBacktick;if(t.currentNode?.tagHandler?.isNonNesting&&a){r+=e[i],i++;continue}r.length>0&&(Q(r,t,n),r=``);let o=en(e,i,t,n);if(o.complete)i=o.newPosition;else{r+=o.remainingText;break}}else{let o=i+1,s=o,c=-1;for(;o<a;){let t=e.charCodeAt(o);if(Z(t)||t===W||t===U){c=o;break}o++}if(c===-1){r+=e.substring(i);break}let l=e.substring(s,c).toLowerCase();if(!l){i=c;break}let u=Ct[l]??-1;if(o=c,t.currentNode?.tagHandler?.isNonNesting&&u!==t.currentNode?.tagId){r+=e[i++];continue}r.length>0&&(Q(r,t,n),r=``);let d=nn(l,u,e,o,t,n);if(d.skip)r+=e[i++];else if(d.complete)i=d.newPosition,d.selfClosing||(t.isFirstTextInElement=!0);else{r+=d.remainingText;break}}}return 
r}function Q(e,t,n){let r=t.textBufferContainsNonWhitespace,i=t.textBufferContainsWhitespace;if(t.textBufferContainsNonWhitespace=!1,t.textBufferContainsWhitespace=!1,!t.currentNode)return;let a=t.currentNode?.tagHandler?.excludesTextNodes,o=t.depthMap[O]>0;if(!o&&!r&&!t.currentNode.childTextNodeIndex)return;let s=e;if(s.length===0)return;let c=zt(t.currentNode),l=c[c.length-1];if(i&&!l?.childTextNodeIndex){let e=0;for(;e<s.length&&(o?s.charCodeAt(e)===Wt||s.charCodeAt(e)===Gt:Z(s.charCodeAt(e)));)e++;e>0&&(s=s.substring(e))}t.hasEncodedHtmlEntity&&(s=Rt(String(s)),t.hasEncodedHtmlEntity=!1);let u={type:L,value:s,parent:t.currentNode,regionId:t.currentNode?.regionId,index:t.currentNode.currentWalkIndex++,depth:t.depth,containsWhitespace:i,excludedFromMarkdown:a};for(let e of c)e.childTextNodeIndex=(e.childTextNodeIndex||0)+1;n({type:R,node:u}),t.lastTextNode=u}function en(e,t,n,r){let i=t+2,a=i,o=e.length,s=!1;for(;i<o;){let t=e.charCodeAt(i);if(t===U){s=!0;break}i++}if(!s)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let c=e.substring(a,i).toLowerCase(),l=Ct[c]??-1;if(n.currentNode?.tagHandler?.isNonNesting&&l!==n.currentNode.tagId)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let u=n.currentNode;if(u){let e=u.tagId!==l;for(;u&&e;)$(u,n,r),u=u.parent,e=u?.tagId!==l}return u&&$(u,n,r),n.justClosedTag=!0,{complete:!0,newPosition:i+1,remainingText:``}}function $(e,t,n){if(e){if(e.tagId===w&&!e.childTextNodeIndex){let t=e.attributes?.title||e.attributes?.[`aria-label`]||``;if(t){e.childTextNodeIndex=1;let r={type:L,value:t,parent:e,index:0,depth:e.depth+1};n({type:R,node:r});for(let t of zt(e))t.childTextNodeIndex=(t.childTextNodeIndex||0)+1}}e.tagId&&(t.depthMap[e.tagId]=Math.max(0,t.depthMap[e.tagId]-1)),e.tagHandler?.isNonNesting&&(t.inSingleQuote=!1,t.inDoubleQuote=!1,t.inBacktick=!1,t.lastCharWasBackslash=!1),t.depth--,n({type:St,node:e}),t.currentNode=t.currentNode.parent,t.hasEncodedHtmlEntity=!1,t.justClosedTag=!0}}function 
tn(e,t){let n=t,r=e.length;if(n+3<r&&e.charCodeAt(n+2)===Y&&e.charCodeAt(n+3)===Y){for(n+=4;n<r-2;){if(e.charCodeAt(n)===Y&&e.charCodeAt(n+1)===Y&&e.charCodeAt(n+2)===U)return n+=3,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:t,remainingText:e.substring(t)}}else{for(n+=2;n<r;){if(e.charCodeAt(n)===U)return n++,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:n,remainingText:e.substring(t,n)}}}function nn(e,t,n,r,i,a){i.currentNode?.tagHandler?.isNonNesting&&$(i.currentNode,i,a);let o=Lt[t],s=rn(n,r,o);if(!s.complete)return{complete:!1,newPosition:r,remainingText:`<${e}${s.attrBuffer}`,selfClosing:!1};let c=i.depthMap[t];i.depthMap[t]=c+1,i.depth++,r=s.newPosition,i.currentNode&&(i.currentNode.currentWalkIndex=i.currentNode.currentWalkIndex||0);let l=i.currentNode?i.currentNode.currentWalkIndex++:0,u={type:I,name:e,attributes:s.attributes,parent:i.currentNode,depthMap:Zt(i.depthMap),depth:i.depth,index:l,regionId:i.currentNode?.regionId,tagId:t,tagHandler:o};i.lastTextNode=u,a({type:R,node:u});let d=u;return d.currentWalkIndex=0,i.currentNode=d,i.hasEncodedHtmlEntity=!1,o?.isNonNesting&&!s.selfClosing&&(i.inSingleQuote=!1,i.inDoubleQuote=!1,i.inBacktick=!1,i.lastCharWasBackslash=!1),s.selfClosing?($(u,i,a),i.justClosedTag=!0):i.justClosedTag=!1,{complete:!0,newPosition:r,remainingText:``,selfClosing:s.selfClosing}}function rn(e,t,n){let r=t,i=e.length,a=n?.isSelfClosing||!1,o=r,s=!1,c=0,l=0;for(;r<i;){let t=e.charCodeAt(r);if(s){t===c&&l!==J&&(s=!1),r++;continue}else if(t===K||t===q)s=!0,c=t;else if(t===W&&r+1<i&&e.charCodeAt(r+1)===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+2,attributes:an(t),selfClosing:!0,attrBuffer:t}}else if(t===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+1,attributes:an(t),selfClosing:a,attrBuffer:t}}r++,l=t}return{complete:!1,newPosition:r,attributes:Xt,selfClosing:!1,attrBuffer:e.substring(o,r)}}function an(e){if(!e)return Xt;let 
t={},n=e.length,r=0,i=0,a=1,o=2,s=3,c=4,l=5,u=i,d=0,f=0,p=0,m=0,h=``;for(;r<n;){let g=e.charCodeAt(r),_=Z(g);switch(u){case i:_||(u=a,d=r,f=0);break;case a:(g===G||_)&&(f=r,h=e.substring(d,f).toLowerCase(),u=g===G?s:o);break;case o:g===G?u=s:_||(t[h]=``,u=a,d=r,f=0);break;case s:g===K||g===q?(m=g,u=c,p=r+1):_||(u=l,p=r);break;case c:g===J&&r+1<n?r++:g===m&&(t[h]=e.substring(p,r),u=i);break;case l:(_||g===U)&&(t[h]=e.substring(p,r),u=i);break}r++}if(u===c||u===l)h&&(t[h]=e.substring(p,r));else if(u===a||u===o||u===s){f||=r;let n=e.substring(d,f).toLowerCase();n&&(t[n]=``)}return t}function on(e,t,n,r){if(t?.length){for(let r of t){let t=r.beforeNodeProcess?.(e,n);if(typeof t==`object`&&t.skip)return!0}if(e.node.type===I){let r=e.node;if(e.type===R)for(let e of t)e.processAttributes&&e.processAttributes(r,n);let i=e.type===R?`onNodeEnter`:`onNodeExit`,a=[];for(let e of t)if(e[i]){let t=e[i](r,n);t&&a.push(t)}a.length>0&&(r.pluginOutput=(r.pluginOutput||[]).concat(a))}else if(e.node.type===L&&e.type===R){let r=e.node;for(let e of t)if(e.processTextNode){let t=e.processTextNode(r,n);if(t){if(t.skip)return!0;r.value=t.content}}}}return r(e),!1}function sn(e,t,n){if(e===` `||e===`
9
+ `||e===` `||t===` `||t===`
10
+ `||t===` `)return!1;let r=new Set([`[`,`(`,`>`,`*`,`_`,"`"]),i=new Set([`]`,`)`,`<`,`.`,`,`,`!`,`?`,`:`,`;`,`*`,`_`,"`"]);return e===`|`&&t===`<`&&n&&n.depthMap[T]>0?!0:!(r.has(e)||i.has(t))}function cn(e,t,n){return!!e&&e!==`
11
+ `&&e!==` `&&e!==`[`&&e!==`>`&&!t?.tagHandler?.isInline&&n.value[0]!==` `}function ln(e){let t=e.tagId,n=e.depthMap;if(t!==C&&n[C]>0||t!==b&&n[b]>0)return z;let r=e.parent;for(;r;){if(r.tagHandler?.collapsesInnerWhiteSpace)return z;r=r.parent}return e.tagHandler?.spacing?e.tagHandler?.spacing:At}function un(n={}){let r={options:n,regionToggles:new Map,regionContentBuffers:new Map,depthMap:new Uint8Array(bt)};r.regionToggles.set(0,!0),r.regionContentBuffers.set(0,[]);let i=0;function a(t){let{type:n,node:i}=t,a=r.lastNode;r.lastNode=t.node,r.depth=i.depth;let o=r.regionContentBuffers.get(i.regionId||0)||[],s=o[o.length-1],c=s?.charAt(s.length-1)||``,l;if(l=s?.length>1?s.charAt(s.length-2):o[o.length-2]?.charAt(o[o.length-2].length-1),i.type===L&&n===R){let t=i;if(t.value){if(t.excludedFromMarkdown||t.value===` `&&c===`
12
+ `)return;cn(c,a,t)&&(t.value=` ${t.value}`),e(t,t.value,r)}r.lastTextNode=t;return}if(i.type!==I)return;let u={node:i,state:r},d=[],f=i;f.pluginOutput?.length&&(d.push(...f.pluginOutput),f.pluginOutput=[]);let p=r.lastContentCache,m=0;c===`
13
+ `&&m++,l===`
14
+ `&&m++;let h=n===R?`enter`:`exit`,g=i.tagHandler;if(!d.length&&g?.[h]){let e=g[h](u);e&&d.push(e)}let _=ln(i),v=_[n]||0,y=Math.max(0,v-m);if(y>0){if(!o.length){for(let t of d)e(i,t,r);return}let t=`
15
+ `.repeat(y);c===` `&&o?.length&&(o[o.length-1]=o[o.length-1].substring(0,o[o.length-1].length-1)),n===R?d.unshift(t):d.push(t)}else if(p&&r.lastTextNode?.containsWhitespace&&i.parent&&`value`in r.lastTextNode&&typeof r.lastTextNode.value==`string`&&(!i.parent.depthMap[O]||i.parent.tagId===O)){let e=i.tagHandler?.isInline,t=i.tagHandler?.collapsesInnerWhiteSpace,a=i.tagHandler?.spacing&&Array.isArray(i.tagHandler.spacing),s=!e&&!t&&v>0,c=(!e||n===St)&&!s&&!(t&&n===R)&&!(a&&n===R);if(c){let e=p.length,t=p.trimEnd(),n=e-t.length;n>0&&o?.length&&o[o.length-1]===p&&(o[o.length-1]=t)}r.lastTextNode=void 0}d[0]?.[0]&&n===R&&c&&sn(c,d[0][0],r)&&e(i,` `,r);for(let t of d)e(i,t,r)}function o(e){let t={depthMap:r.depthMap,depth:0,plugins:r.options?.plugins||[]};Qt(e,t,e=>{on(e,r.options?.plugins,r,a)})}function s(){let e=t(r);return e.trimEnd()}function c(){let e=[];for(let[t,n]of Array.from(r.regionContentBuffers.entries())){let i=r.regionToggles.get(t);i&&e.push(...n)}let t=e.join(``).trimStart(),n=t.slice(i);return i=t.length,n}return{processEvent:a,processHtml:o,getMarkdown:s,getMarkdownChunk:c,state:r}}function dn(e,t={}){let n=un(t);return n.processHtml(e),n.getMarkdown()}const fn={htmlToMarkdown:dn};typeof window<`u`&&(window.mdream=fn);var pn=fn;
16
+
17
+ // Expose mdream globally
18
+ if (typeof window !== 'undefined') {
19
+ window.mdream = fn;
20
+ }
21
+
22
+ })();
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-E56bjFoA.mjs";
2
- import { createPlugin$1 as createPlugin } from "./_chunks/plugin-B8PiU4Eb.mjs";
1
+ import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-B94khc0C.mjs";
2
+ import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
3
3
  import { ReadableStream } from "node:stream/web";
4
4
 
5
5
  //#region src/const.d.ts
@@ -115,43 +115,6 @@ declare const TagIdMap: {
115
115
  readonly caption: 107;
116
116
  };
117
117
  //#endregion
118
- //#region src/llms-txt.d.ts
119
- interface LlmsTxtArtifactsOptions {
120
- patterns?: string | string[];
121
- files?: ProcessedFile[];
122
- siteName?: string;
123
- description?: string;
124
- origin?: string;
125
- generateFull?: boolean;
126
- generateMarkdown?: boolean;
127
- outputDir?: string;
128
- }
129
- interface ProcessedFile {
130
- filePath?: string;
131
- title: string;
132
- content: string;
133
- url: string;
134
- metadata?: {
135
- title?: string;
136
- description?: string;
137
- keywords?: string;
138
- author?: string;
139
- };
140
- }
141
- interface LlmsTxtArtifactsResult {
142
- llmsTxt: string;
143
- llmsFullTxt?: string;
144
- markdownFiles?: {
145
- path: string;
146
- content: string;
147
- }[];
148
- processedFiles: ProcessedFile[];
149
- }
150
- /**
151
- * Main function to process files and generate llms.txt artifacts
152
- */
153
- declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
154
- //#endregion
155
118
  //#region src/markdown-processor.d.ts
156
119
  interface MarkdownState {
157
120
  /** Configuration options for conversion */
@@ -218,4 +181,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
218
181
  //#region src/index.d.ts
219
182
  declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
220
183
  //#endregion
221
- export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, type LlmsTxtArtifactsOptions, type LlmsTxtArtifactsResult, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, type ProcessedFile, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
184
+ export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { TagIdMap, createPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
2
- import { MarkdownProcessor, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-DYO16Ybo.mjs";
1
+ import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
+ import { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-B4vBEPKi.mjs";
3
3
 
4
- export { MarkdownProcessor, TagIdMap, createPlugin, generateLlmsTxtArtifacts, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
4
+ export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
@@ -0,0 +1,38 @@
1
+ //#region src/llms-txt.d.ts
2
+ interface LlmsTxtArtifactsOptions {
3
+ patterns?: string | string[];
4
+ files?: ProcessedFile[];
5
+ siteName?: string;
6
+ description?: string;
7
+ origin?: string;
8
+ generateFull?: boolean;
9
+ generateMarkdown?: boolean;
10
+ outputDir?: string;
11
+ }
12
+ interface ProcessedFile {
13
+ filePath?: string;
14
+ title: string;
15
+ content: string;
16
+ url: string;
17
+ metadata?: {
18
+ title?: string;
19
+ description?: string;
20
+ keywords?: string;
21
+ author?: string;
22
+ };
23
+ }
24
+ interface LlmsTxtArtifactsResult {
25
+ llmsTxt: string;
26
+ llmsFullTxt?: string;
27
+ markdownFiles?: {
28
+ path: string;
29
+ content: string;
30
+ }[];
31
+ processedFiles: ProcessedFile[];
32
+ }
33
+ /**
34
+ * Main function to process files and generate llms.txt artifacts
35
+ */
36
+ declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
37
+ //#endregion
38
+ export { LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, ProcessedFile, generateLlmsTxtArtifacts };
@@ -0,0 +1,6 @@
1
+ import "./_chunks/plugin-Bqz9GKOA.mjs";
2
+ import "./_chunks/src-B4vBEPKi.mjs";
3
+ import "./_chunks/extraction-BSOWm6fo.mjs";
4
+ import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-B4Tz5bHd.mjs";
5
+
6
+ export { generateLlmsTxtArtifacts };
@@ -1,5 +1,5 @@
1
- import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-E56bjFoA.mjs";
2
- import { createPlugin$1 as createPlugin } from "./_chunks/plugin-B8PiU4Eb.mjs";
1
+ import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-B94khc0C.mjs";
2
+ import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
3
3
 
4
4
  //#region src/plugins/filter.d.ts
5
5
 
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,5 @@
1
- import { createPlugin, extractionPlugin } from "./_chunks/extraction-D28Kr1J3.mjs";
2
- import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-DXY-fo9h.mjs";
1
+ import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
+ import { extractionPlugin } from "./_chunks/extraction-BSOWm6fo.mjs";
3
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-TeB1_RYL.mjs";
3
4
 
4
5
  export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
@@ -1,4 +1,4 @@
1
- import { HTMLToMarkdownOptions } from "../_chunks/types-E56bjFoA.mjs";
1
+ import { HTMLToMarkdownOptions } from "../_chunks/types-B94khc0C.mjs";
2
2
 
3
3
  //#region src/preset/minimal.d.ts
4
4
 
@@ -1,5 +1,6 @@
1
- import "../_chunks/extraction-D28Kr1J3.mjs";
2
- import "../_chunks/plugins-DXY-fo9h.mjs";
3
- import { withMinimalPreset } from "../_chunks/minimal-CCnrG7a1.mjs";
1
+ import "../_chunks/plugin-Bqz9GKOA.mjs";
2
+ import "../_chunks/extraction-BSOWm6fo.mjs";
3
+ import "../_chunks/plugins-TeB1_RYL.mjs";
4
+ import { withMinimalPreset } from "../_chunks/minimal-DSW9dhXV.mjs";
4
5
 
5
6
  export { withMinimalPreset };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.10.3",
4
+ "version": "0.11.0",
5
5
  "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -23,6 +23,14 @@
23
23
  },
24
24
  "default": "./dist/index.mjs"
25
25
  },
26
+ "./llms-txt": {
27
+ "types": "./dist/llms-txt.d.mts",
28
+ "import": {
29
+ "types": "./dist/llms-txt.d.mts",
30
+ "default": "./dist/llms-txt.mjs"
31
+ },
32
+ "default": "./dist/llms-txt.mjs"
33
+ },
26
34
  "./cli": {
27
35
  "types": "./dist/cli.d.mts",
28
36
  "import": {
@@ -49,6 +57,8 @@
49
57
  }
50
58
  },
51
59
  "main": "./dist/index.mjs",
60
+ "unpkg": "./dist/iife.js",
61
+ "jsdelivr": "./dist/iife.js",
52
62
  "types": "./dist/index.d.mts",
53
63
  "bin": {
54
64
  "mdream": "./bin/mdream.mjs"
@@ -57,6 +67,7 @@
57
67
  "bin",
58
68
  "dist"
59
69
  ],
70
+ "browser": "./dist/iife.js",
60
71
  "dependencies": {
61
72
  "cac": "^6.7.14",
62
73
  "pathe": "^2.0.3",
@@ -79,6 +90,7 @@
79
90
  "typecheck": "tsc --noEmit",
80
91
  "dev:prepare": "obuild --stub",
81
92
  "test": "vitest test",
93
+ "test:browser": "vitest test --project=browser",
82
94
  "test:attw": "attw --pack"
83
95
  }
84
96
  }