mdream 0.10.3 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/_chunks/extraction-BSOWm6fo.mjs +146 -0
- package/dist/_chunks/llms-txt-B4Tz5bHd.mjs +225 -0
- package/dist/_chunks/{minimal-CCnrG7a1.mjs → minimal-DSW9dhXV.mjs} +2 -2
- package/dist/_chunks/{plugin-B8PiU4Eb.d.mts → plugin-BUiqQb0v.d.mts} +1 -1
- package/dist/_chunks/{extraction-D28Kr1J3.mjs → plugin-Bqz9GKOA.mjs} +1 -144
- package/dist/_chunks/{plugins-DXY-fo9h.mjs → plugins-TeB1_RYL.mjs} +2 -1
- package/dist/_chunks/{src-DYO16Ybo.mjs → src-B4vBEPKi.mjs} +2 -223
- package/dist/cli.mjs +8 -6
- package/dist/iife.js +22 -0
- package/dist/index.d.mts +3 -40
- package/dist/index.mjs +3 -3
- package/dist/llms-txt.d.mts +38 -0
- package/dist/llms-txt.mjs +6 -0
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +3 -2
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +4 -3
- package/package.json +13 -1
- /package/dist/_chunks/{types-E56bjFoA.d.mts → types-B94khc0C.d.mts} +0 -0
package/README.md
CHANGED
|
@@ -82,6 +82,33 @@ Mdream provides two main functions for working with HTML:
|
|
|
82
82
|
- `htmlToMarkdown`: Useful if you already have the entire HTML payload you want to convert.
|
|
83
83
|
- `streamHtmlToMarkdown`: Best practice if you are fetching or reading from a local file.
|
|
84
84
|
|
|
85
|
+
## Browser CDN Usage
|
|
86
|
+
|
|
87
|
+
For browser environments, you can use mdream directly via CDN without any build step:
|
|
88
|
+
|
|
89
|
+
```html
|
|
90
|
+
<!DOCTYPE html>
|
|
91
|
+
<html>
|
|
92
|
+
<head>
|
|
93
|
+
<script src="https://unpkg.com/mdream/dist/iife.js"></script>
|
|
94
|
+
</head>
|
|
95
|
+
<body>
|
|
96
|
+
<script>
|
|
97
|
+
// Convert HTML to Markdown in the browser
|
|
98
|
+
const html = '<h1>Hello World</h1><p>This is a paragraph.</p>'
|
|
99
|
+
const markdown = window.mdream.htmlToMarkdown(html)
|
|
100
|
+
console.log(markdown) // # Hello World\n\nThis is a paragraph.
|
|
101
|
+
</script>
|
|
102
|
+
</body>
|
|
103
|
+
</html>
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**CDN Options:**
|
|
107
|
+
- **unpkg**: `https://unpkg.com/mdream/dist/iife.js`
|
|
108
|
+
- **jsDelivr**: `https://cdn.jsdelivr.net/npm/mdream/dist/iife.js`
|
|
109
|
+
|
|
110
|
+
The browser build includes the core `htmlToMarkdown` function and is optimized for size (44kB uncompressed, 10.3kB gzipped).
|
|
111
|
+
|
|
85
112
|
**Convert existing HTML**
|
|
86
113
|
|
|
87
114
|
```ts
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import { createPlugin } from "./plugin-Bqz9GKOA.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/libs/query-selector.ts
|
|
4
|
+
/**
 * Build a matcher for a bare tag-name selector such as 'div', 'p' or 'h1'.
 *
 * @param {string} tagName - Tag name compared against `element.name`.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` is a strict equality check on `element.name`;
 *   `toString` gives back the tag name unchanged.
 */
function createTagSelector(tagName) {
  const matches = (element) => tagName === element.name;
  const toString = () => tagName;
  return { matches, toString };
}
|
|
13
|
+
/**
 * Build a matcher for an ID selector such as '#main' or '#content'.
 *
 * @param {string} selector - Selector text including the leading '#'.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` compares `element.attributes.id` to the id (false when the
 *   element has no attributes); `toString` rebuilds the '#id' form.
 */
function createIdSelector(selector) {
  // Everything after the leading '#'.
  const targetId = selector.substring(1);
  const matches = (element) => {
    const elementId = element.attributes?.id;
    return elementId === targetId;
  };
  const toString = () => `#${targetId}`;
  return { matches, toString };
}
|
|
23
|
+
/**
 * Creates a class selector matcher (e.g., '.container', '.header').
 *
 * The element's `class` attribute is treated as a whitespace-separated token
 * list, so names separated by tabs, newlines or repeated spaces are matched
 * the same way as names separated by a single space.
 *
 * @param {string} selector - Selector text including the leading '.'.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` is true when the class list contains the exact class name;
 *   `toString` rebuilds the '.name' form.
 */
function createClassSelector(selector) {
  const className = selector.slice(1);
  return {
    matches: (element) => {
      const classAttr = element.attributes?.class;
      if (!classAttr) return false;
      // Split on any whitespace run, not just a single space character.
      return classAttr.trim().split(/\s+/).filter(Boolean).includes(className);
    },
    toString: () => `.${className}`
  };
}
|
|
37
|
+
/**
 * Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]').
 *
 * Supported forms: presence (`[attr]`) and the operators `=`, `^=`, `$=`,
 * `*=`, `~=` and `|=`. Surrounding quotes around the value are optional and
 * are dropped. If the selector text cannot be parsed, everything between the
 * outer brackets is used as the attribute name.
 *
 * @param {string} selector - Selector text including the surrounding brackets.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` tests an element's `attributes` object; `toString` rebuilds the
 *   selector (without quotes around the value).
 */
function createAttributeSelector(selector) {
  const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
  const attrName = match ? match[1] : selector.slice(1, -1);
  const operator = match?.[2];
  const attrValue = match?.[3];
  return {
    matches: (element) => {
      const attributes = element.attributes;
      // Own-property check: `in` would also see inherited keys such as
      // "constructor" or "toString" and match every element.
      if (!attributes || !Object.prototype.hasOwnProperty.call(attributes, attrName)) return false;
      if (!operator || !attrValue) return true;
      const value = attributes[attrName];
      switch (operator) {
        case "=": return value === attrValue;
        case "^=": return value.startsWith(attrValue);
        case "$=": return value.endsWith(attrValue);
        case "*=": return value.includes(attrValue);
        // `~=` matches one token of a whitespace-separated list (any whitespace).
        case "~=": return value.trim().split(/\s+/).filter(Boolean).includes(attrValue);
        case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
        default: return false;
      }
    },
    toString: () => {
      if (!operator || !attrValue) return `[${attrName}]`;
      return `[${attrName}${operator}${attrValue}]`;
    }
  };
}
|
|
66
|
+
/**
 * Combine several simple matchers into one compound matcher
 * (e.g. 'div.container', 'h1#title').
 *
 * @param {Array<{ matches: (element: object) => boolean, toString: () => string }>} selectors
 *   Matchers that must all accept an element.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   `matches` is true only when no part rejects the element (so an empty list
 *   matches everything); `toString` concatenates the parts' string forms.
 */
function createCompoundSelector(selectors) {
  const matches = (element) => !selectors.some((part) => !part.matches(element));
  const toString = () => selectors.reduce((text, part) => text + part.toString(), "");
  return { matches, toString };
}
|
|
75
|
+
/**
 * Parses a CSS selector into a matcher.
 *
 * The selector is split into simple parts — tag name, `.class`, `#id` and
 * `[attribute]` — and each part is turned into its own matcher. A `.`, `#`
 * or `[` found inside an attribute's brackets is part of that attribute
 * selector and does not start a new part, so selectors such as
 * `a[href="https://example.com/#top"]` stay intact.
 *
 * @param {string} selector - Simple or compound selector; surrounding
 *   whitespace is ignored.
 * @returns {{ matches: (element: object) => boolean, toString: () => string }}
 *   The single part's matcher, or a compound matcher when there are several.
 * @throws {Error} "Empty selector" when the selector is empty after trimming.
 */
function parseSelector(selector) {
  const source = selector.trim();
  if (!source) throw new Error("Empty selector");

  // Pick the matcher factory from the part's leading character.
  const toMatcher = (part) => {
    switch (part[0]) {
      case ".": return createClassSelector(part);
      case "#": return createIdSelector(part);
      case "[": return createAttributeSelector(part);
      default: return createTagSelector(part);
    }
  };

  const parts = [];
  let current = "";
  let inAttribute = false;
  for (const char of source) {
    // Delimiters only start a new part outside of [...] brackets.
    const startsNewPart = !inAttribute && (char === "." || char === "#" || char === "[");
    if (startsNewPart && current) {
      parts.push(current);
      current = "";
    }
    current += char;
    if (char === "[") inAttribute = true;
    else if (char === "]") inAttribute = false;
  }
  if (current) parts.push(current);

  const matchers = parts.map((part) => toMatcher(part));
  return matchers.length === 1 ? matchers[0] : createCompoundSelector(matchers);
}
|
|
104
|
+
|
|
105
|
+
//#endregion
|
|
106
|
+
//#region src/plugins/extraction.ts
|
|
107
|
+
/**
 * Builds a plugin that reports elements matching the given selectors.
 *
 * Each key of `selectors` is parsed with `parseSelector`. When an element
 * matching one or more selectors is entered it starts being tracked; the
 * value of every text node whose parent chain contains a tracked element is
 * appended to that element's text. When the element exits, every callback
 * whose selector matched it is called with a shallow copy of the element plus
 * a trimmed `textContent`, and with the `state` passed to the exit hook.
 *
 * @param {Record<string, (element: object, state: unknown) => void>} selectors
 *   Map from selector text to the callback for matching elements.
 * @returns The value returned by `createPlugin` for the hooks defined here.
 */
function extractionPlugin(selectors) {
  const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
    matcher: parseSelector(selector),
    callback
  }));
  const trackedElements = new Map();
  return createPlugin({
    onNodeEnter(element) {
      for (const { matcher, callback } of matcherCallbacks) {
        if (!matcher.matches(element)) continue;
        const tracked = trackedElements.get(element);
        // Several selectors may match one element: keep every callback
        // instead of letting the last match overwrite the earlier ones.
        if (tracked) tracked.callbacks.push(callback);
        else trackedElements.set(element, {
          textContent: "",
          callbacks: [callback]
        });
      }
    },
    processTextNode(textNode) {
      // Credit the text to every tracked ancestor, not only the direct parent.
      let currentParent = textNode.parent;
      while (currentParent) {
        const tracked = trackedElements.get(currentParent);
        if (tracked) tracked.textContent += textNode.value;
        currentParent = currentParent.parent;
      }
      return void 0;
    },
    onNodeExit(element, state) {
      const tracked = trackedElements.get(element);
      if (!tracked) return;
      const textContent = tracked.textContent.trim();
      for (const callback of tracked.callbacks) {
        // Each callback gets its own copy so one cannot affect another.
        callback({
          ...element,
          textContent
        }, state);
      }
      trackedElements.delete(element);
    }
  });
}
|
|
144
|
+
|
|
145
|
+
//#endregion
|
|
146
|
+
export { extractionPlugin, parseSelector };
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
import { htmlToMarkdown } from "./src-B4vBEPKi.mjs";
|
|
2
|
+
import { extractionPlugin } from "./extraction-BSOWm6fo.mjs";
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { basename, dirname, relative, sep } from "pathe";
|
|
5
|
+
import { glob } from "tinyglobby";
|
|
6
|
+
|
|
7
|
+
//#region src/llms-txt.ts
|
|
8
|
+
/**
 * Collect page metadata from HTML with the extraction plugin.
 *
 * The HTML is run through `htmlToMarkdown` only so the plugin callbacks fire;
 * the Markdown it returns is discarded. For each field the first non-empty
 * value seen wins:
 * - title: `<title>` text or `meta[property="og:title"]` content
 * - description: `meta[name="description"]` or `meta[property="og:description"]` content
 * - keywords: `meta[name="keywords"]` content
 * - author: `meta[name="author"]` content
 *
 * @param {string} html - HTML document source.
 * @param {string} url - Passed to the converter as its `origin` option.
 * @returns {{ title?: string, description?: string, keywords?: string, author?: string }}
 *   Fields that were not found are `undefined`.
 */
function extractMetadata(html, url) {
  const found = {
    title: "",
    description: "",
    keywords: "",
    author: ""
  };
  // Callback factory: fill `field` from the element's `content` attribute
  // unless a value was already recorded.
  const fromContentAttribute = (field) => (element) => {
    if (found[field]) return;
    if (element.attributes?.content) found[field] = element.attributes.content.trim();
  };
  const extractionPluginInstance = extractionPlugin({
    "title": (element) => {
      if (found.title) return;
      if (element.textContent) found.title = element.textContent.trim();
    },
    "meta[name=\"description\"]": fromContentAttribute("description"),
    "meta[property=\"og:description\"]": fromContentAttribute("description"),
    "meta[name=\"keywords\"]": fromContentAttribute("keywords"),
    "meta[name=\"author\"]": fromContentAttribute("author"),
    "meta[property=\"og:title\"]": fromContentAttribute("title")
  });
  htmlToMarkdown(html, {
    plugins: [extractionPluginInstance],
    origin: url
  });
  const orUndefined = (text) => text || void 0;
  return {
    title: orUndefined(found.title),
    description: orUndefined(found.description),
    keywords: orUndefined(found.keywords),
    author: orUndefined(found.author)
  };
}
|
|
47
|
+
/**
 * Turn a file path into a site-relative URL path.
 *
 * The path is taken relative to `baseDir`, separators become '/', a trailing
 * '.html' is dropped, a trailing '/index' is dropped, a bare 'index' becomes
 * '/', and the result always starts with '/'.
 *
 * @param {string} filePath - Path of the file.
 * @param {string} baseDir - Directory the URL is computed against.
 * @returns {string} URL path beginning with '/'.
 */
function pathToUrl(filePath, baseDir) {
  const htmlExtension = ".html";
  const indexSuffix = "/index";
  let route = relative(baseDir, filePath).split(sep).join("/");
  if (route.endsWith(htmlExtension)) route = route.slice(0, -htmlExtension.length);
  if (route.endsWith(indexSuffix)) route = route.slice(0, -indexSuffix.length);
  if (route === "index") return "/";
  return route.startsWith("/") ? route : `/${route}`;
}
|
|
59
|
+
/**
 * Process HTML files from glob patterns.
 *
 * Every pattern is expanded with `glob`, duplicate paths are dropped, and
 * each file is read as UTF-8, converted with `htmlToMarkdown` and scanned
 * with `extractMetadata`. URLs are computed relative to the deepest directory
 * shared by all matched files, so the order in which the glob returns files
 * does not affect them. A file that fails to read or convert is logged with
 * `console.error` and skipped; the remaining files are still processed.
 *
 * @param {string | string[]} patterns - One glob pattern or a list of them.
 * @param {string} [origin] - Passed to the converter as its `origin` option;
 *   metadata extraction falls back to the file path when it is falsy.
 * @returns {Promise<Array<{ filePath: string, title: string, content: string, url: string, metadata: object }>>}
 *   One entry per successfully processed file, in match order.
 */
async function processHtmlFiles(patterns, origin) {
  const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
  const seenFiles = new Set();
  for (const pattern of allPatterns) {
    for (const file of await glob(pattern)) seenFiles.add(file);
  }
  const uniqueFiles = [...seenFiles];
  const baseDir = commonBaseDir(uniqueFiles);
  const results = [];
  for (const filePath of uniqueFiles) {
    try {
      const html = await readFile(filePath, "utf-8");
      const metadata = extractMetadata(html, origin || filePath);
      const content = htmlToMarkdown(html, { origin });
      const url = pathToUrl(filePath, baseDir);
      results.push({
        filePath,
        title: metadata?.title || basename(filePath, ".html"),
        content,
        url,
        metadata
      });
    } catch (error) {
      // Best effort: report the failure and continue with the other files.
      console.error(`Error processing ${filePath}:`, error);
    }
  }
  return results;
}
/**
 * Deepest directory that contains every given file ('.' when there is none).
 */
function commonBaseDir(filePaths) {
  if (filePaths.length === 0) return ".";
  const segmentLists = filePaths.map((filePath) => {
    const dir = dirname(filePath);
    return dir === "." ? [] : dir.split("/");
  });
  let shared = segmentLists[0];
  for (const segments of segmentLists.slice(1)) {
    let length = 0;
    while (length < shared.length && length < segments.length && shared[length] === segments[length]) length++;
    shared = shared.slice(0, length);
  }
  if (shared.length === 0) return ".";
  // A lone empty segment means the only shared ancestor is the filesystem root.
  return shared.join("/") || "/";
}
|
|
89
|
+
/**
 * Generate llms.txt content.
 *
 * Output is a `# siteName` heading, an optional `> description` quote and,
 * when there are files, a `## Pages` list with one link per file. A file's
 * metadata description is appended after the link, cut to 100 characters
 * with a trailing '...' when longer.
 *
 * Link targets:
 * - a file whose `filePath` ends in '.md' while `options.outputDir` is set
 *   links to that path relative to `outputDir`;
 * - an `http://` or `https://` URL is used as is;
 * - any other URL is prefixed with `origin`. When `origin` ends with '/' and
 *   the URL starts with '/', one slash is dropped so the result has no '//'.
 *
 * @param {Array<{ title: string, url: string, filePath?: string, metadata?: { description?: string } }>} files
 * @param {{ siteName?: string, description?: string, origin?: string, outputDir?: string }} options
 *   `siteName` defaults to "Site" and `origin` to "".
 * @returns {string} The llms.txt text.
 */
function generateLlmsTxtContent(files, options) {
  const { siteName = "Site", description, origin = "" } = options;
  const resolveUrl = (url) => {
    if (url.startsWith("http://") || url.startsWith("https://")) return url;
    // Avoid "https://host//path" when both sides carry the slash.
    if (origin.endsWith("/") && url.startsWith("/")) return origin.slice(0, -1) + url;
    return origin + url;
  };
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (files.length === 0) return content;
  content += `## Pages\n\n`;
  for (const file of files) {
    const desc = file.metadata?.description;
    const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
    const isLocalMarkdown = file.filePath && options.outputDir && file.filePath.endsWith(".md");
    const target = isLocalMarkdown ? relative(options.outputDir, file.filePath) : resolveUrl(file.url);
    content += `- [${file.title}](${target})${descText}\n`;
  }
  return content;
}
|
|
112
|
+
/**
 * Parse frontmatter from markdown content.
 *
 * Frontmatter is a block at the very start of the content delimited by
 * `---` lines. Each line inside it is split at its first ':' into a trimmed
 * key and a trimmed value; lines with no ':' or with an empty key are
 * ignored. Values stay plain strings (no YAML typing or unquoting).
 *
 * LF and CRLF line endings are both accepted, the closing `---` may be the
 * last thing in the content, and the block may be empty.
 *
 * @param {string} content - Markdown text, possibly starting with frontmatter.
 * @returns {{ frontmatter: Record<string, string> | null, body: string }}
 *   `frontmatter` is null and `body` is the whole content when no frontmatter
 *   block is found; otherwise `body` is everything after the closing line.
 */
function parseFrontmatter(content) {
  const frontmatterRegex = /^---[ \t]*\r?\n(?:([\s\S]*?)\r?\n)?---[ \t]*(?:\r?\n([\s\S]*))?$/;
  const match = content.match(frontmatterRegex);
  if (!match) return {
    frontmatter: null,
    body: content
  };
  const frontmatter = {};
  // Either capture group is undefined when the block or the body is absent.
  for (const line of (match[1] ?? "").split(/\r?\n/)) {
    const colonIndex = line.indexOf(":");
    if (colonIndex > 0) {
      const key = line.substring(0, colonIndex).trim();
      const value = line.substring(colonIndex + 1).trim();
      frontmatter[key] = value;
    }
  }
  return {
    frontmatter,
    body: match[2] ?? ""
  };
}
|
|
139
|
+
/**
 * Render a flat object as `key: value` lines (YAML-like).
 *
 * Entries whose value is `undefined` or `null` are left out; every other
 * value is converted with `String()`. Lines are joined with '\n' and there
 * is no trailing newline.
 *
 * @param {Record<string, unknown>} data - Frontmatter fields.
 * @returns {string} The serialized lines ('' when nothing is emitted).
 */
function serializeFrontmatter(data) {
  return Object.entries(data)
    .filter(([, value]) => value !== void 0 && value !== null)
    .map(([key, value]) => `${key}: ${String(value)}`)
    .join("\n");
}
|
|
147
|
+
/**
 * Generate llms-full.txt content with complete page content.
 *
 * Output is a `# siteName` heading, an optional `> description` quote and,
 * when there are files, a `## Table of Contents` list followed by one section
 * per file. Each section is a frontmatter block (title, url, optional file
 * path, description, keywords, author) followed by the page content and a
 * `---` separator.
 *
 * Frontmatter already present in a file's content is merged in, with the
 * generated fields taking precedence. If the first line of the content is the
 * title itself or `# title`, that line is removed.
 *
 * URLs that are not `http://`/`https://` are prefixed with `origin`; when
 * `origin` ends with '/' and the URL starts with '/', one slash is dropped so
 * the result has no '//'.
 *
 * @param {Array<{ title: string, url: string, content: string, filePath?: string, metadata?: { description?: string, keywords?: string, author?: string } }>} files
 * @param {{ siteName?: string, description?: string, origin?: string, outputDir?: string }} options
 *   `siteName` defaults to "Site" and `origin` to "".
 * @returns {string} The llms-full.txt text.
 */
function generateLlmsFullTxtContent(files, options) {
  const { siteName = "Site", description, origin = "" } = options;
  const resolveUrl = (url) => {
    if (url.startsWith("http://") || url.startsWith("https://")) return url;
    // Avoid "https://host//path" when both sides carry the slash.
    if (origin.endsWith("/") && url.startsWith("/")) return origin.slice(0, -1) + url;
    return origin + url;
  };
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (files.length === 0) return content;

  content += `## Table of Contents\n\n`;
  for (const file of files) {
    const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
    content += `- [${file.title}](#${anchor})\n`;
  }
  content += `\n---\n\n`;

  for (const file of files) {
    const { frontmatter, body } = parseFrontmatter(file.content);
    const metadata = {
      title: file.title,
      url: resolveUrl(file.url)
    };
    if (file.filePath) metadata.file = options.outputDir ? relative(options.outputDir, file.filePath) : file.filePath;
    if (file.metadata) {
      if (file.metadata.description) metadata.description = file.metadata.description;
      if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
      if (file.metadata.author) metadata.author = file.metadata.author;
    }
    // Generated fields win over same-named keys from existing frontmatter.
    const mergedFrontmatter = frontmatter ? {
      ...frontmatter,
      ...metadata
    } : metadata;
    const frontmatterString = serializeFrontmatter(mergedFrontmatter);
    let contentBody = frontmatter ? body : file.content;
    const bodyLines = contentBody.trim().split("\n");
    const titleLine = bodyLines[0];
    // The title is already in the frontmatter; drop a duplicate first line.
    if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = bodyLines.slice(1).join("\n").trimStart();
    content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
  }
  return content;
}
|
|
188
|
+
/**
 * Plan one Markdown output file per processed page.
 *
 * The output path is `md/<url path>.md`, where the URL has one leading and
 * one trailing '/' removed and the root URL '/' maps to 'index'.
 *
 * @param {Array<{ url: string, content: string }>} files - Processed pages.
 * @returns {Array<{ path: string, content: string }>} Path/content pairs in
 *   the same order as `files`.
 */
function generateMarkdownFilesContent(files) {
  const toUrlPath = (url) => {
    if (url === "/") return "index";
    return url.replace(/^\//, "").replace(/\/$/, "");
  };
  return files.map((file) => ({
    path: `md/${toUrlPath(file.url)}.md`,
    content: file.content
  }));
}
|
|
203
|
+
/**
 * Entry point: gather processed pages and build the llms.txt artifacts.
 *
 * Pages come from `options.files` when it is truthy; otherwise
 * `options.patterns` is expanded and processed with `processHtmlFiles`.
 * llms.txt is always generated; llms-full.txt only when
 * `options.generateFull` is truthy and the Markdown file list only when
 * `options.generateMarkdown` is truthy.
 *
 * @param {object} options - Generation options (`files` or `patterns`,
 *   `origin`, `generateFull`, `generateMarkdown`, plus whatever the content
 *   generators read).
 * @returns {Promise<{ llmsTxt: string, llmsFullTxt: string | undefined, markdownFiles: Array<object> | undefined, processedFiles: Array<object> }>}
 * @throws {Error} "Either patterns or files must be provided" when both are
 *   missing (surfaces as a rejected promise).
 */
async function generateLlmsTxtArtifacts(options) {
  const { files: providedFiles, patterns } = options;
  if (!providedFiles && !patterns) throw new Error("Either patterns or files must be provided");
  const processedFiles = providedFiles ? providedFiles : await processHtmlFiles(patterns, options.origin);
  const llmsTxt = generateLlmsTxtContent(processedFiles, options);
  const llmsFullTxt = options.generateFull ? generateLlmsFullTxtContent(processedFiles, options) : void 0;
  const markdownFiles = options.generateMarkdown ? generateMarkdownFilesContent(processedFiles) : void 0;
  return {
    llmsTxt,
    llmsFullTxt,
    markdownFiles,
    processedFiles
  };
}
|
|
223
|
+
|
|
224
|
+
//#endregion
|
|
225
|
+
export { generateLlmsTxtArtifacts };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./
|
|
2
|
-
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-
|
|
1
|
+
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-TeB1_RYL.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/preset/minimal.ts
|
|
5
5
|
/**
|
|
@@ -284,109 +284,6 @@ const BLOCKQUOTE_SPACING = [1, 1];
|
|
|
284
284
|
const LIST_ITEM_SPACING = [1, 0];
|
|
285
285
|
const TABLE_ROW_SPACING = [0, 1];
|
|
286
286
|
|
|
287
|
-
//#endregion
|
|
288
|
-
//#region src/libs/query-selector.ts
|
|
289
|
-
/**
|
|
290
|
-
* Creates a tag selector matcher (e.g., 'div', 'p', 'h1')
|
|
291
|
-
*/
|
|
292
|
-
function createTagSelector(tagName) {
|
|
293
|
-
return {
|
|
294
|
-
matches: (element) => element.name === tagName,
|
|
295
|
-
toString: () => tagName
|
|
296
|
-
};
|
|
297
|
-
}
|
|
298
|
-
/**
|
|
299
|
-
* Creates an ID selector matcher (e.g., '#main', '#content')
|
|
300
|
-
*/
|
|
301
|
-
function createIdSelector(selector) {
|
|
302
|
-
const id = selector.slice(1);
|
|
303
|
-
return {
|
|
304
|
-
matches: (element) => element.attributes?.id === id,
|
|
305
|
-
toString: () => `#${id}`
|
|
306
|
-
};
|
|
307
|
-
}
|
|
308
|
-
/**
|
|
309
|
-
* Creates a class selector matcher (e.g., '.container', '.header')
|
|
310
|
-
*/
|
|
311
|
-
function createClassSelector(selector) {
|
|
312
|
-
const className = selector.slice(1);
|
|
313
|
-
return {
|
|
314
|
-
matches: (element) => {
|
|
315
|
-
if (!element.attributes?.class) return false;
|
|
316
|
-
const classes = element.attributes.class.trim().split(" ").filter(Boolean);
|
|
317
|
-
return classes.includes(className);
|
|
318
|
-
},
|
|
319
|
-
toString: () => `.${className}`
|
|
320
|
-
};
|
|
321
|
-
}
|
|
322
|
-
/**
|
|
323
|
-
* Creates an attribute selector matcher (e.g., '[data-id]', '[href="https://example.com"]')
|
|
324
|
-
*/
|
|
325
|
-
function createAttributeSelector(selector) {
|
|
326
|
-
const match = selector.match(/\[([^\]=~|^$*]+)(?:([=~|^$*]+)["']?([^"'\]]+)["']?)?\]/);
|
|
327
|
-
const attrName = match ? match[1] : selector.slice(1, -1);
|
|
328
|
-
const operator = match?.[2];
|
|
329
|
-
const attrValue = match?.[3];
|
|
330
|
-
return {
|
|
331
|
-
matches: (element) => {
|
|
332
|
-
if (!(attrName in (element.attributes || {}))) return false;
|
|
333
|
-
if (!operator || !attrValue) return true;
|
|
334
|
-
const value = element.attributes[attrName];
|
|
335
|
-
switch (operator) {
|
|
336
|
-
case "=": return value === attrValue;
|
|
337
|
-
case "^=": return value.startsWith(attrValue);
|
|
338
|
-
case "$=": return value.endsWith(attrValue);
|
|
339
|
-
case "*=": return value.includes(attrValue);
|
|
340
|
-
case "~=": return value.trim().split(" ").filter(Boolean).includes(attrValue);
|
|
341
|
-
case "|=": return value === attrValue || value.startsWith(`${attrValue}-`);
|
|
342
|
-
default: return false;
|
|
343
|
-
}
|
|
344
|
-
},
|
|
345
|
-
toString: () => {
|
|
346
|
-
if (!operator || !attrValue) return `[${attrName}]`;
|
|
347
|
-
return `[${attrName}${operator}${attrValue}]`;
|
|
348
|
-
}
|
|
349
|
-
};
|
|
350
|
-
}
|
|
351
|
-
/**
|
|
352
|
-
* Creates a compound selector that combines multiple selectors (e.g., 'div.container', 'h1#title')
|
|
353
|
-
*/
|
|
354
|
-
function createCompoundSelector(selectors) {
|
|
355
|
-
return {
|
|
356
|
-
matches: (element) => selectors.every((selector) => selector.matches(element)),
|
|
357
|
-
toString: () => selectors.map((s) => s.toString()).join("")
|
|
358
|
-
};
|
|
359
|
-
}
|
|
360
|
-
/**
|
|
361
|
-
* Parses a CSS selector into a matcher
|
|
362
|
-
*/
|
|
363
|
-
function parseSelector(selector) {
|
|
364
|
-
selector = selector.trim();
|
|
365
|
-
if (!selector) throw new Error("Empty selector");
|
|
366
|
-
const selectorParts = [];
|
|
367
|
-
let current = "";
|
|
368
|
-
let inAttribute = false;
|
|
369
|
-
for (let i = 0; i < selector.length; i++) {
|
|
370
|
-
const char = selector[i];
|
|
371
|
-
if ((char === "." || char === "#" || char === "[") && current) {
|
|
372
|
-
if (current[0] === ".") selectorParts.push(createClassSelector(current));
|
|
373
|
-
else if (current[0] === "#") selectorParts.push(createIdSelector(current));
|
|
374
|
-
else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
|
|
375
|
-
else selectorParts.push(createTagSelector(current));
|
|
376
|
-
current = char;
|
|
377
|
-
} else current += char;
|
|
378
|
-
if (char === "[") inAttribute = true;
|
|
379
|
-
if (char === "]") inAttribute = false;
|
|
380
|
-
if (inAttribute && char !== "[") {}
|
|
381
|
-
}
|
|
382
|
-
if (current) if (current[0] === ".") selectorParts.push(createClassSelector(current));
|
|
383
|
-
else if (current[0] === "#") selectorParts.push(createIdSelector(current));
|
|
384
|
-
else if (current[0] === "[") selectorParts.push(createAttributeSelector(current));
|
|
385
|
-
else selectorParts.push(createTagSelector(current));
|
|
386
|
-
if (selectorParts.length === 1) return selectorParts[0];
|
|
387
|
-
return createCompoundSelector(selectorParts);
|
|
388
|
-
}
|
|
389
|
-
|
|
390
287
|
//#endregion
|
|
391
288
|
//#region src/pluggable/plugin.ts
|
|
392
289
|
/**
|
|
@@ -399,44 +296,4 @@ function createPlugin(plugin) {
|
|
|
399
296
|
}
|
|
400
297
|
|
|
401
298
|
//#endregion
|
|
402
|
-
|
|
403
|
-
function extractionPlugin(selectors) {
|
|
404
|
-
const matcherCallbacks = Object.entries(selectors).map(([selector, callback]) => ({
|
|
405
|
-
matcher: parseSelector(selector),
|
|
406
|
-
callback
|
|
407
|
-
}));
|
|
408
|
-
const trackedElements = new Map();
|
|
409
|
-
return createPlugin({
|
|
410
|
-
onNodeEnter(element) {
|
|
411
|
-
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
412
|
-
if (matcher.matches(element)) trackedElements.set(element, {
|
|
413
|
-
textContent: "",
|
|
414
|
-
callback
|
|
415
|
-
});
|
|
416
|
-
});
|
|
417
|
-
},
|
|
418
|
-
processTextNode(textNode) {
|
|
419
|
-
let currentParent = textNode.parent;
|
|
420
|
-
while (currentParent) {
|
|
421
|
-
const tracked = trackedElements.get(currentParent);
|
|
422
|
-
if (tracked) tracked.textContent += textNode.value;
|
|
423
|
-
currentParent = currentParent.parent;
|
|
424
|
-
}
|
|
425
|
-
return void 0;
|
|
426
|
-
},
|
|
427
|
-
onNodeExit(element, state) {
|
|
428
|
-
const tracked = trackedElements.get(element);
|
|
429
|
-
if (tracked) {
|
|
430
|
-
const extractedElement = {
|
|
431
|
-
...element,
|
|
432
|
-
textContent: tracked.textContent.trim()
|
|
433
|
-
};
|
|
434
|
-
tracked.callback(extractedElement, state);
|
|
435
|
-
trackedElements.delete(element);
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
});
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
//#endregion
|
|
442
|
-
export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin, extractionPlugin, parseSelector };
|
|
299
|
+
export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin };
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin
|
|
1
|
+
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin } from "./plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { parseSelector } from "./extraction-BSOWm6fo.mjs";
|
|
2
3
|
|
|
3
4
|
//#region src/plugins/filter.ts
|
|
4
5
|
/**
|
|
@@ -1,7 +1,4 @@
|
|
|
1
|
-
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent
|
|
2
|
-
import { readFile } from "node:fs/promises";
|
|
3
|
-
import { basename, dirname, relative, sep } from "pathe";
|
|
4
|
-
import { glob } from "tinyglobby";
|
|
1
|
+
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./plugin-Bqz9GKOA.mjs";
|
|
5
2
|
|
|
6
3
|
//#region src/tags.ts
|
|
7
4
|
function resolveUrl(url, origin) {
|
|
@@ -1387,224 +1384,6 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1387
1384
|
}
|
|
1388
1385
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1389
1386
|
|
|
1390
|
-
//#endregion
|
|
1391
|
-
//#region src/llms-txt.ts
|
|
1392
|
-
/**
|
|
1393
|
-
* Extract metadata from HTML content using mdream's extraction plugin
|
|
1394
|
-
*/
|
|
1395
|
-
function extractMetadata(html, url) {
|
|
1396
|
-
let title = "";
|
|
1397
|
-
let description = "";
|
|
1398
|
-
let keywords = "";
|
|
1399
|
-
let author = "";
|
|
1400
|
-
const extractionPluginInstance = extractionPlugin({
|
|
1401
|
-
"title": (element) => {
|
|
1402
|
-
if (!title && element.textContent) title = element.textContent.trim();
|
|
1403
|
-
},
|
|
1404
|
-
"meta[name=\"description\"]": (element) => {
|
|
1405
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
1406
|
-
},
|
|
1407
|
-
"meta[property=\"og:description\"]": (element) => {
|
|
1408
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
1409
|
-
},
|
|
1410
|
-
"meta[name=\"keywords\"]": (element) => {
|
|
1411
|
-
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
1412
|
-
},
|
|
1413
|
-
"meta[name=\"author\"]": (element) => {
|
|
1414
|
-
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
1415
|
-
},
|
|
1416
|
-
"meta[property=\"og:title\"]": (element) => {
|
|
1417
|
-
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
1418
|
-
}
|
|
1419
|
-
});
|
|
1420
|
-
htmlToMarkdown(html, {
|
|
1421
|
-
plugins: [extractionPluginInstance],
|
|
1422
|
-
origin: url
|
|
1423
|
-
});
|
|
1424
|
-
return {
|
|
1425
|
-
title: title || void 0,
|
|
1426
|
-
description: description || void 0,
|
|
1427
|
-
keywords: keywords || void 0,
|
|
1428
|
-
author: author || void 0
|
|
1429
|
-
};
|
|
1430
|
-
}
|
|
1431
|
-
/**
|
|
1432
|
-
* Convert file path to URL path
|
|
1433
|
-
*/
|
|
1434
|
-
function pathToUrl(filePath, baseDir) {
|
|
1435
|
-
let url = relative(baseDir, filePath);
|
|
1436
|
-
url = url.split(sep).join("/");
|
|
1437
|
-
if (url.endsWith(".html")) url = url.slice(0, -5);
|
|
1438
|
-
if (url.endsWith("/index")) url = url.slice(0, -6);
|
|
1439
|
-
if (url === "index") return "/";
|
|
1440
|
-
if (!url.startsWith("/")) url = `/${url}`;
|
|
1441
|
-
return url;
|
|
1442
|
-
}
|
|
1443
|
-
/**
|
|
1444
|
-
* Process HTML files from glob patterns
|
|
1445
|
-
*/
|
|
1446
|
-
async function processHtmlFiles(patterns, origin) {
|
|
1447
|
-
const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
|
|
1448
|
-
const allFiles = [];
|
|
1449
|
-
for (const pattern of allPatterns) {
|
|
1450
|
-
const files = await glob(pattern);
|
|
1451
|
-
allFiles.push(...files);
|
|
1452
|
-
}
|
|
1453
|
-
const uniqueFiles = [...new Set(allFiles)];
|
|
1454
|
-
const results = [];
|
|
1455
|
-
const baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
|
|
1456
|
-
for (const filePath of uniqueFiles) try {
|
|
1457
|
-
const html = await readFile(filePath, "utf-8");
|
|
1458
|
-
const metadata = extractMetadata(html, origin || filePath);
|
|
1459
|
-
const content = htmlToMarkdown(html, { origin });
|
|
1460
|
-
const url = pathToUrl(filePath, baseDir);
|
|
1461
|
-
results.push({
|
|
1462
|
-
filePath,
|
|
1463
|
-
title: metadata?.title || basename(filePath, ".html"),
|
|
1464
|
-
content,
|
|
1465
|
-
url,
|
|
1466
|
-
metadata
|
|
1467
|
-
});
|
|
1468
|
-
} catch (error) {
|
|
1469
|
-
console.error(`Error processing ${filePath}:`, error);
|
|
1470
|
-
}
|
|
1471
|
-
return results;
|
|
1472
|
-
}
|
|
1473
|
-
/**
|
|
1474
|
-
* Generate llms.txt content
|
|
1475
|
-
*/
|
|
1476
|
-
function generateLlmsTxtContent(files, options) {
|
|
1477
|
-
const { siteName = "Site", description, origin = "" } = options;
|
|
1478
|
-
let content = `# ${siteName}\n\n`;
|
|
1479
|
-
if (description) content += `> ${description}\n\n`;
|
|
1480
|
-
if (files.length > 0) {
|
|
1481
|
-
content += `## Pages\n\n`;
|
|
1482
|
-
for (const file of files) {
|
|
1483
|
-
const desc = file.metadata?.description;
|
|
1484
|
-
const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
|
|
1485
|
-
if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
|
|
1486
|
-
const relativePath = relative(options.outputDir, file.filePath);
|
|
1487
|
-
content += `- [${file.title}](${relativePath})${descText}\n`;
|
|
1488
|
-
} else {
|
|
1489
|
-
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
|
|
1490
|
-
content += `- [${file.title}](${url})${descText}\n`;
|
|
1491
|
-
}
|
|
1492
|
-
}
|
|
1493
|
-
}
|
|
1494
|
-
return content;
|
|
1495
|
-
}
|
|
1496
|
-
/**
|
|
1497
|
-
* Parse frontmatter from markdown content
|
|
1498
|
-
*/
|
|
1499
|
-
function parseFrontmatter(content) {
|
|
1500
|
-
const frontmatterRegex = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
|
|
1501
|
-
const match = content.match(frontmatterRegex);
|
|
1502
|
-
if (!match) return {
|
|
1503
|
-
frontmatter: null,
|
|
1504
|
-
body: content
|
|
1505
|
-
};
|
|
1506
|
-
const frontmatterContent = match[1];
|
|
1507
|
-
const body = match[2];
|
|
1508
|
-
const frontmatter = {};
|
|
1509
|
-
const lines = frontmatterContent.split("\n");
|
|
1510
|
-
for (const line of lines) {
|
|
1511
|
-
const colonIndex = line.indexOf(":");
|
|
1512
|
-
if (colonIndex > 0) {
|
|
1513
|
-
const key = line.substring(0, colonIndex).trim();
|
|
1514
|
-
const value = line.substring(colonIndex + 1).trim();
|
|
1515
|
-
frontmatter[key] = value;
|
|
1516
|
-
}
|
|
1517
|
-
}
|
|
1518
|
-
return {
|
|
1519
|
-
frontmatter,
|
|
1520
|
-
body
|
|
1521
|
-
};
|
|
1522
|
-
}
|
|
1523
|
-
/**
|
|
1524
|
-
* Serialize frontmatter object to YAML-like format
|
|
1525
|
-
*/
|
|
1526
|
-
function serializeFrontmatter(data) {
|
|
1527
|
-
const lines = [];
|
|
1528
|
-
for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
|
|
1529
|
-
return lines.join("\n");
|
|
1530
|
-
}
|
|
1531
|
-
/**
|
|
1532
|
-
* Generate llms-full.txt content with complete page content
|
|
1533
|
-
*/
|
|
1534
|
-
function generateLlmsFullTxtContent(files, options) {
|
|
1535
|
-
const { siteName = "Site", description, origin = "" } = options;
|
|
1536
|
-
let content = `# ${siteName}\n\n`;
|
|
1537
|
-
if (description) content += `> ${description}\n\n`;
|
|
1538
|
-
if (files.length > 0) {
|
|
1539
|
-
content += `## Table of Contents\n\n`;
|
|
1540
|
-
for (const file of files) {
|
|
1541
|
-
const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
|
|
1542
|
-
content += `- [${file.title}](#${anchor})\n`;
|
|
1543
|
-
}
|
|
1544
|
-
content += `\n---\n\n`;
|
|
1545
|
-
for (const file of files) {
|
|
1546
|
-
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
1547
|
-
const { frontmatter, body } = parseFrontmatter(file.content);
|
|
1548
|
-
const metadata = {
|
|
1549
|
-
title: file.title,
|
|
1550
|
-
url
|
|
1551
|
-
};
|
|
1552
|
-
if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
|
|
1553
|
-
else if (file.filePath) metadata.file = file.filePath;
|
|
1554
|
-
if (file.metadata) {
|
|
1555
|
-
if (file.metadata.description) metadata.description = file.metadata.description;
|
|
1556
|
-
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
1557
|
-
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
1558
|
-
}
|
|
1559
|
-
const mergedFrontmatter = frontmatter ? {
|
|
1560
|
-
...frontmatter,
|
|
1561
|
-
...metadata
|
|
1562
|
-
} : metadata;
|
|
1563
|
-
const frontmatterString = serializeFrontmatter(mergedFrontmatter);
|
|
1564
|
-
let contentBody = frontmatter ? body : file.content;
|
|
1565
|
-
const titleLine = contentBody.trim().split("\n")[0];
|
|
1566
|
-
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
1567
|
-
content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
|
|
1568
|
-
}
|
|
1569
|
-
}
|
|
1570
|
-
return content;
|
|
1571
|
-
}
|
|
1572
|
-
/**
|
|
1573
|
-
* Generate individual markdown files structure
|
|
1574
|
-
*/
|
|
1575
|
-
function generateMarkdownFilesContent(files) {
|
|
1576
|
-
const markdownFiles = [];
|
|
1577
|
-
for (const file of files) {
|
|
1578
|
-
const urlPath = file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "");
|
|
1579
|
-
const mdPath = `md/${urlPath}.md`;
|
|
1580
|
-
markdownFiles.push({
|
|
1581
|
-
path: mdPath,
|
|
1582
|
-
content: file.content
|
|
1583
|
-
});
|
|
1584
|
-
}
|
|
1585
|
-
return markdownFiles;
|
|
1586
|
-
}
|
|
1587
|
-
/**
|
|
1588
|
-
* Main function to process files and generate llms.txt artifacts
|
|
1589
|
-
*/
|
|
1590
|
-
async function generateLlmsTxtArtifacts(options) {
|
|
1591
|
-
let files;
|
|
1592
|
-
if (options.files) files = options.files;
|
|
1593
|
-
else if (options.patterns) files = await processHtmlFiles(options.patterns, options.origin);
|
|
1594
|
-
else throw new Error("Either patterns or files must be provided");
|
|
1595
|
-
const llmsTxt = generateLlmsTxtContent(files, options);
|
|
1596
|
-
let llmsFullTxt;
|
|
1597
|
-
if (options.generateFull) llmsFullTxt = generateLlmsFullTxtContent(files, options);
|
|
1598
|
-
let markdownFiles;
|
|
1599
|
-
if (options.generateMarkdown) markdownFiles = generateMarkdownFilesContent(files);
|
|
1600
|
-
return {
|
|
1601
|
-
llmsTxt,
|
|
1602
|
-
llmsFullTxt,
|
|
1603
|
-
markdownFiles,
|
|
1604
|
-
processedFiles: files
|
|
1605
|
-
};
|
|
1606
|
-
}
|
|
1607
|
-
|
|
1608
1387
|
//#endregion
|
|
1609
1388
|
//#region src/stream.ts
|
|
1610
1389
|
/**
|
|
@@ -1655,4 +1434,4 @@ function htmlToMarkdown(html, options = {}) {
|
|
|
1655
1434
|
}
|
|
1656
1435
|
|
|
1657
1436
|
//#endregion
|
|
1658
|
-
export { MarkdownProcessor,
|
|
1437
|
+
export { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
import "./_chunks/
|
|
2
|
-
import {
|
|
3
|
-
import "./_chunks/
|
|
4
|
-
import {
|
|
5
|
-
import
|
|
6
|
-
import {
|
|
1
|
+
import "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { streamHtmlToMarkdown } from "./_chunks/src-B4vBEPKi.mjs";
|
|
3
|
+
import "./_chunks/extraction-BSOWm6fo.mjs";
|
|
4
|
+
import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-B4Tz5bHd.mjs";
|
|
5
|
+
import "./_chunks/plugins-TeB1_RYL.mjs";
|
|
6
|
+
import { withMinimalPreset } from "./_chunks/minimal-DSW9dhXV.mjs";
|
|
7
7
|
import { readFileSync } from "node:fs";
|
|
8
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
8
9
|
import { Readable } from "node:stream";
|
|
9
10
|
import { fileURLToPath } from "node:url";
|
|
10
11
|
import { cac } from "cac";
|
|
12
|
+
import { dirname, join, resolve } from "pathe";
|
|
11
13
|
|
|
12
14
|
//#region src/cli.ts
|
|
13
15
|
async function streamingConvert(options = {}) {
|
package/dist/iife.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
(function() {
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
function e(e,t,n){if(!t)return;let r=e.regionId||0,i=n.regionContentBuffers.get(r);i&&(i.push(t),n.lastContentCache=t)}function t(e){let t=[];for(let[n,r]of Array.from(e.regionContentBuffers.entries())){let i=e.regionToggles.get(n);i&&t.push(...r)}return e.regionToggles.clear(),e.regionContentBuffers.clear(),t.join(``).trimStart()}const n=0,r=1,i=2,a=3,o=4,s=5,c=6,l=7,u=8,d=9,f=10,p=11,m=12,h=13,g=14,_=15,v=16,y=17,ee=18,te=19,ne=20,re=21,b=22,x=23,S=24,C=25,w=26,ie=27,T=28,ae=29,oe=30,se=31,E=32,D=33,O=34,ce=35,le=36,ue=37,de=38,fe=39,pe=40,me=41,he=42,ge=43,_e=44,ve=45,ye=46,k=47,be=48,A=49,xe=50,Se=51,j=52,M=53,N=54,P=55,F=56,Ce=57,we=58,Te=59,Ee=60,De=61,Oe=62,ke=63,Ae=64,je=65,Me=66,Ne=67,Pe=68,Fe=69,Ie=70,Le=71,Re=72,ze=73,Be=74,Ve=75,He=76,Ue=77,We=78,Ge=79,Ke=80,qe=81,Je=82,Ye=83,Xe=84,Ze=85,Qe=86,$e=87,et=88,tt=89,nt=90,rt=91,it=92,at=93,ot=94,st=95,ct=96,lt=97,ut=98,dt=99,ft=100,pt=101,mt=102,ht=103,gt=104,_t=105,vt=106,yt=107,bt=108,xt={"&":`&`,"<":`<`,">":`>`,""":`"`,"'":`'`,"'":`'`," ":` `},I=1,L=2,R=0,St=1,Ct={html:n,head:r,details:i,summary:a,title:o,meta:s,br:c,h1:l,h2:u,h3:d,h4:f,h5:p,h6:m,hr:h,strong:g,b:_,em:v,i:y,del:ee,sub:te,sup:ne,ins:re,blockquote:b,code:x,ul:S,li:C,a:w,img:ie,table:T,thead:ae,tr:oe,th:se,td:E,ol:D,pre:O,p:ce,div:le,span:ue,tbody:de,tfoot:fe,form:pe,nav:me,label:he,button:ge,body:_e,center:ve,kbd:ye,footer:k,path:be,svg:A,article:xe,section:Se,script:j,style:M,link:N,area:P,base:F,col:Ce,embed:we,input:Te,keygen:Ee,param:De,source:Oe,track:ke,wbr:Ae,select:je,textarea:Me,option:Ne,fieldset:Pe,legend:Fe,audio:Ie,video:Le,canvas:Re,iframe:ze,map:Be,dialog:Ve,meter:He,progress:Ue,template:We,abbr:Ge,mark:Ke,q:qe,samp:Je,small:Ye,noscript:Xe,noframes:Ze,xmp:Qe,plaintext:$e,aside:et,u:tt,cite:nt,dfn:rt,var:it,time:at,bdo:ot,ruby:st,rt:ct,rp:lt,dd:ut,dt,dl:pt,address:ft,figure:mt,object:ht,main:gt,header:_t,figcaption:vt,caption:yt},wt=`**`,Tt=`_`,Et=`~~`,Dt="```",Ot="`",kt=`---`,z=[0,0],At=[2,2],jt=[1,1],Mt=[1,0],B=[0,1];function 
Nt(e,t){if(!e)return e;if(e.startsWith(`//`))return`https:${e}`;if(t){if(e.startsWith(`/`)&&t){let n=t.endsWith(`/`)?t.slice(0,-1):t;return`${n}${e}`}if(e.startsWith(`./`))return`${t}/${e.slice(2)}`;if(!e.startsWith(`http`)){let n=e.startsWith(`/`)?e.slice(1):e;return`${t}/${n}`}}return e}function V(e){return e.depthMap[E]>0}function Pt(e){if(!e)return``;let t=e.split(` `).map(e=>e.split(`language-`)[1]).filter(Boolean);return t.length>0?t[0].trim():``}function H(e){return{enter:({node:t})=>t.depthMap[w]?`<h${e}>`:`${`#`.repeat(e)} `,exit:({node:t})=>{if(t.depthMap[w])return`</h${e}>`},collapsesInnerWhiteSpace:!0}}const Ft={enter:({node:e})=>e.depthMap[_]>1?``:wt,exit:({node:e})=>e.depthMap[_]>1?``:wt,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},It={enter:({node:e})=>e.depthMap[y]>1?``:Tt,exit:({node:e})=>e.depthMap[y]>1?``:Tt,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},Lt={[r]:{spacing:z,collapsesInnerWhiteSpace:!0},[i]:{enter:()=>`<details>`,exit:()=>`</details>
|
|
5
|
+
|
|
6
|
+
`},[a]:{enter:()=>`<summary>`,exit:()=>`</summary>
|
|
7
|
+
|
|
8
|
+
`},[o]:{collapsesInnerWhiteSpace:!0,isNonNesting:!0,spacing:z},[j]:{excludesTextNodes:!0,isNonNesting:!0},[M]:{isNonNesting:!0,excludesTextNodes:!0},[s]:{collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z},[c]:{enter:({node:e})=>V(e)?`<br>`:void 0,isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[l]:H(1),[u]:H(2),[d]:H(3),[f]:H(4),[p]:H(5),[m]:H(6),[h]:{enter:()=>kt,isSelfClosing:!0},[g]:Ft,[_]:Ft,[v]:It,[y]:It,[ee]:{enter:()=>Et,exit:()=>Et,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[te]:{enter:()=>`<sub>`,exit:()=>`</sub>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ne]:{enter:()=>`<sup>`,exit:()=>`</sup>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[re]:{enter:()=>`<ins>`,exit:()=>`</ins>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[b]:{enter:({node:e})=>{let t=e.depthMap[b]||1,n=`> `.repeat(t);return e.depthMap[C]>0&&(n=`\n${` `.repeat(e.depthMap[C])}${n}`),n},spacing:jt},[x]:{enter:({node:e})=>{if((e.depthMap[O]||0)>0){let t=Pt(e.attributes?.class);return`${Dt}${t}\n`}return Ot},exit:({node:e})=>e.depthMap[O]>0?`\n${Dt}`:Ot,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[S]:{enter:({node:e})=>V(e)?`<ul>`:void 0,exit:({node:e})=>V(e)?`</ul>`:void 0},[C]:{enter:({node:e})=>{if(V(e))return`<li>`;let t=(e.depthMap[S]||0)+(e.depthMap[D]||0)-1,n=e.parent?.tagId===D,r=` `.repeat(Math.max(0,t)),i=n?`${e.index+1}. 
`:`- `;return`${r}${i}`},exit:({node:e})=>V(e)?`</li>`:void 0,spacing:Mt},[w]:{enter:({node:e})=>{if(e.attributes?.href)return`[`},exit:({node:e,state:t})=>{if(!e.attributes?.href)return``;let n=Nt(e.attributes?.href||``,t.options?.origin),r=e.attributes?.title,i=t.lastContentCache;return i===r&&(r=``),r?`](${n} "${r}")`:`](${n})`},collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ie]:{enter:({node:e,state:t})=>{let n=e.attributes?.alt||``,r=Nt(e.attributes?.src||``,t.options?.origin);return``},collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z,isInline:!0},[T]:{enter:({node:e,state:t})=>{if(V(e))return`<table>`;e.depthMap[T]<=1&&(t.tableRenderedTable=!1),t.tableColumnAlignments=[]},exit:({node:e})=>V(e)?`</table>`:void 0},[ae]:{enter:({node:e})=>{if(V(e))return`<thead>`},exit:({node:e})=>V(e)?`</thead>`:void 0,spacing:B,excludesTextNodes:!0},[oe]:{enter:({node:e,state:t})=>V(e)?`<tr>`:(t.tableCurrentRowCells=0,`| `),exit:({node:e,state:t})=>{if(V(e)||e.depthMap[T]>1)return`</tr>`;if(!t.tableRenderedTable){t.tableRenderedTable=!0;let e=t.tableColumnAlignments;for(;e.length<t.tableCurrentRowCells;)e.push(``);let n=e.map(e=>{switch(e){case`left`:return`:---`;case`center`:return`:---:`;case`right`:return`---:`;default:return`---`}});return` |\n| ${n.join(` | `)} |`}return` |`},excludesTextNodes:!0,spacing:B},[se]:{enter:({node:e,state:t})=>{if(e.depthMap[T]>1)return`<th>`;let n=e.attributes?.align?.toLowerCase();return n?t.tableColumnAlignments.push(n):t.tableColumnAlignments.length<=t.tableCurrentRowCells&&t.tableColumnAlignments.push(``),e.index===0?``:` | `},exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</th>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[E]:{enter:({node:e})=>e.depthMap[T]>1?`<td>`:e.index===0?``:` | 
`,exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</td>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[ce]:{},[le]:{},[ue]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[me]:{},[he]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ge]:{collapsesInnerWhiteSpace:!0,isInline:!0},[_e]:{spacing:z},[ve]:{enter:({node:e})=>{if(e.depthMap[T]>1)return`<center>`},exit:({node:e})=>{if(e.depthMap[T]>1)return`</center>`},spacing:z},[de]:{spacing:z,excludesTextNodes:!0},[fe]:{spacing:B,excludesTextNodes:!0},[ye]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[k]:{spacing:z},[pe]:{spacing:z},[N]:{isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[P]:{isSelfClosing:!0,spacing:z,isInline:!0},[F]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ce]:{isSelfClosing:!0,spacing:z},[we]:{isSelfClosing:!0,spacing:z},[Te]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ee]:{isSelfClosing:!0,spacing:z,isInline:!0},[De]:{isSelfClosing:!0,spacing:z},[Oe]:{isSelfClosing:!0,spacing:z},[ke]:{isSelfClosing:!0,spacing:z},[Ae]:{isSelfClosing:!0,spacing:z,isInline:!0},[A]:{spacing:z},[je]:{spacing:z},[Me]:{isNonNesting:!0,spacing:z},[Ne]:{isNonNesting:!0,spacing:z},[Pe]:{spacing:z},[Fe]:{spacing:z},[Ie]:{spacing:z},[Le]:{spacing:z},[Re]:{spacing:z},[ze]:{isNonNesting:!0,spacing:z},[Be]:{spacing:z},[Ve]:{spacing:z},[He]:{spacing:z},[Ue]:{spacing:z},[We]:{spacing:z},[Ge]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ke]:{enter:()=>`<mark>`,exit:()=>`</mark>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[qe]:{enter:()=>`"`,exit:()=>`"`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Je]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ye]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Xe]:{excludesTextNodes:!0,spacing:z},[Ze]:{isNonNesting:!0,spacing:z},[Qe]:{isNonNesting:!0,spacing:z},[$e]:{isNonNesting:!0,spacing:z},[et]:{spacing
:z},[tt]:{enter:()=>`<u>`,exit:()=>`</u>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[nt]:{enter:()=>`*`,exit:()=>`*`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[rt]:{enter:()=>`**`,exit:()=>`**`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[it]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[at]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ot]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[st]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ct]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[lt]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ft]:{enter:()=>`<address>`,exit:()=>`</address>`,spacing:z,collapsesInnerWhiteSpace:!0},[pt]:{spacing:z,enter:()=>`<dl>`,exit:()=>`</dl>`},[dt]:{enter:()=>`<dt>`,exit:()=>`</dt>`,collapsesInnerWhiteSpace:!0,spacing:[0,1]},[ut]:{enter:()=>`<dd>`,exit:()=>`</dd>`,spacing:[0,1]}};function Rt(e){let t=``,n=0;for(;n<e.length;){if(e[n]===`&`){let r=!1;for(let[i,a]of Object.entries(xt))if(e.startsWith(i,n)){t+=a,n+=i.length,r=!0;break}if(r)continue;if(n+2<e.length&&e[n+1]===`#`){let r=n;n+=2;let i=e[n]===`x`||e[n]===`X`;i&&n++;let a=n;for(;n<e.length&&e[n]!==`;`;)n++;if(n<e.length&&e[n]===`;`){let r=e.substring(a,n),o=i?16:10;try{let e=Number.parseInt(r,o);if(!Number.isNaN(e)){t+=String.fromCodePoint(e),n++;continue}}catch{}}n=r}}t+=e[n],n++}return t}function zt(e){let t=e,n=[t];for(;t.tagHandler?.isInline&&t.parent;)t=t.parent,n.push(t);return n}const Bt=60,U=62,W=47,G=61,K=34,q=39,Vt=33,Ht=38,J=92,Y=45,X=32,Ut=9,Wt=10,Gt=13,Kt=96,qt=124,Jt=91,Yt=93,Xt=Object.freeze({});function Zt(e){return new Uint8Array(e)}function Z(e){return e===X||e===Ut||e===Wt||e===Gt}function Qt(e,t,n){return $t(e,t,n)}function $t(e,t,n){let r=``;t.depthMap??=new 
Uint8Array(bt),t.depth??=0,t.lastCharWasWhitespace??=!0,t.justClosedTag??=!1,t.isFirstTextInElement??=!1,t.lastCharWasBackslash??=!1;let i=0,a=e.length;for(;i<a;){let o=e.charCodeAt(i);if(o!==Bt){if(o===Ht&&(t.hasEncodedHtmlEntity=!0),Z(o)){let n=t.depthMap[O]>0;if(t.justClosedTag&&(t.justClosedTag=!1,t.lastCharWasWhitespace=!1),!n&&t.lastCharWasWhitespace){i++;continue}n?r+=e[i]:(o===X||!t.lastCharWasWhitespace)&&(r+=` `),t.lastCharWasWhitespace=!0,t.textBufferContainsWhitespace=!0,t.lastCharWasBackslash=!1}else t.textBufferContainsNonWhitespace=!0,t.lastCharWasWhitespace=!1,t.justClosedTag=!1,o===qt&&t.depthMap[T]?r+=`\\|`:o===Kt&&(t.depthMap[x]||t.depthMap[O])?r+="\\`":o===Jt&&t.depthMap[w]?r+=`\\[`:o===Yt&&t.depthMap[w]?r+=`\\]`:o===U&&t.depthMap[b]?r+=`\\>`:r+=e[i],t.currentNode?.tagHandler?.isNonNesting&&(t.lastCharWasBackslash||(o===q&&!t.inDoubleQuote&&!t.inBacktick?t.inSingleQuote=!t.inSingleQuote:o===K&&!t.inSingleQuote&&!t.inBacktick?t.inDoubleQuote=!t.inDoubleQuote:o===Kt&&!t.inSingleQuote&&!t.inDoubleQuote&&(t.inBacktick=!t.inBacktick))),t.lastCharWasBackslash=o===J;i++;continue}if(i+1>=a){r+=e[i];break}let s=e.charCodeAt(i+1);if(s===Vt){r.length>0&&(Q(r,t,n),r=``);let a=tn(e,i);if(a.complete)i=a.newPosition;else{r+=a.remainingText;break}}else if(s===W){let a=t.inSingleQuote||t.inDoubleQuote||t.inBacktick;if(t.currentNode?.tagHandler?.isNonNesting&&a){r+=e[i],i++;continue}r.length>0&&(Q(r,t,n),r=``);let o=en(e,i,t,n);if(o.complete)i=o.newPosition;else{r+=o.remainingText;break}}else{let o=i+1,s=o,c=-1;for(;o<a;){let t=e.charCodeAt(o);if(Z(t)||t===W||t===U){c=o;break}o++}if(c===-1){r+=e.substring(i);break}let l=e.substring(s,c).toLowerCase();if(!l){i=c;break}let u=Ct[l]??-1;if(o=c,t.currentNode?.tagHandler?.isNonNesting&&u!==t.currentNode?.tagId){r+=e[i++];continue}r.length>0&&(Q(r,t,n),r=``);let d=nn(l,u,e,o,t,n);if(d.skip)r+=e[i++];else if(d.complete)i=d.newPosition,d.selfClosing||(t.isFirstTextInElement=!0);else{r+=d.remainingText;break}}}return 
r}function Q(e,t,n){let r=t.textBufferContainsNonWhitespace,i=t.textBufferContainsWhitespace;if(t.textBufferContainsNonWhitespace=!1,t.textBufferContainsWhitespace=!1,!t.currentNode)return;let a=t.currentNode?.tagHandler?.excludesTextNodes,o=t.depthMap[O]>0;if(!o&&!r&&!t.currentNode.childTextNodeIndex)return;let s=e;if(s.length===0)return;let c=zt(t.currentNode),l=c[c.length-1];if(i&&!l?.childTextNodeIndex){let e=0;for(;e<s.length&&(o?s.charCodeAt(e)===Wt||s.charCodeAt(e)===Gt:Z(s.charCodeAt(e)));)e++;e>0&&(s=s.substring(e))}t.hasEncodedHtmlEntity&&(s=Rt(String(s)),t.hasEncodedHtmlEntity=!1);let u={type:L,value:s,parent:t.currentNode,regionId:t.currentNode?.regionId,index:t.currentNode.currentWalkIndex++,depth:t.depth,containsWhitespace:i,excludedFromMarkdown:a};for(let e of c)e.childTextNodeIndex=(e.childTextNodeIndex||0)+1;n({type:R,node:u}),t.lastTextNode=u}function en(e,t,n,r){let i=t+2,a=i,o=e.length,s=!1;for(;i<o;){let t=e.charCodeAt(i);if(t===U){s=!0;break}i++}if(!s)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let c=e.substring(a,i).toLowerCase(),l=Ct[c]??-1;if(n.currentNode?.tagHandler?.isNonNesting&&l!==n.currentNode.tagId)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let u=n.currentNode;if(u){let e=u.tagId!==l;for(;u&&e;)$(u,n,r),u=u.parent,e=u?.tagId!==l}return u&&$(u,n,r),n.justClosedTag=!0,{complete:!0,newPosition:i+1,remainingText:``}}function $(e,t,n){if(e){if(e.tagId===w&&!e.childTextNodeIndex){let t=e.attributes?.title||e.attributes?.[`aria-label`]||``;if(t){e.childTextNodeIndex=1;let r={type:L,value:t,parent:e,index:0,depth:e.depth+1};n({type:R,node:r});for(let t of zt(e))t.childTextNodeIndex=(t.childTextNodeIndex||0)+1}}e.tagId&&(t.depthMap[e.tagId]=Math.max(0,t.depthMap[e.tagId]-1)),e.tagHandler?.isNonNesting&&(t.inSingleQuote=!1,t.inDoubleQuote=!1,t.inBacktick=!1,t.lastCharWasBackslash=!1),t.depth--,n({type:St,node:e}),t.currentNode=t.currentNode.parent,t.hasEncodedHtmlEntity=!1,t.justClosedTag=!0}}function 
tn(e,t){let n=t,r=e.length;if(n+3<r&&e.charCodeAt(n+2)===Y&&e.charCodeAt(n+3)===Y){for(n+=4;n<r-2;){if(e.charCodeAt(n)===Y&&e.charCodeAt(n+1)===Y&&e.charCodeAt(n+2)===U)return n+=3,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:t,remainingText:e.substring(t)}}else{for(n+=2;n<r;){if(e.charCodeAt(n)===U)return n++,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:n,remainingText:e.substring(t,n)}}}function nn(e,t,n,r,i,a){i.currentNode?.tagHandler?.isNonNesting&&$(i.currentNode,i,a);let o=Lt[t],s=rn(n,r,o);if(!s.complete)return{complete:!1,newPosition:r,remainingText:`<${e}${s.attrBuffer}`,selfClosing:!1};let c=i.depthMap[t];i.depthMap[t]=c+1,i.depth++,r=s.newPosition,i.currentNode&&(i.currentNode.currentWalkIndex=i.currentNode.currentWalkIndex||0);let l=i.currentNode?i.currentNode.currentWalkIndex++:0,u={type:I,name:e,attributes:s.attributes,parent:i.currentNode,depthMap:Zt(i.depthMap),depth:i.depth,index:l,regionId:i.currentNode?.regionId,tagId:t,tagHandler:o};i.lastTextNode=u,a({type:R,node:u});let d=u;return d.currentWalkIndex=0,i.currentNode=d,i.hasEncodedHtmlEntity=!1,o?.isNonNesting&&!s.selfClosing&&(i.inSingleQuote=!1,i.inDoubleQuote=!1,i.inBacktick=!1,i.lastCharWasBackslash=!1),s.selfClosing?($(u,i,a),i.justClosedTag=!0):i.justClosedTag=!1,{complete:!0,newPosition:r,remainingText:``,selfClosing:s.selfClosing}}function rn(e,t,n){let r=t,i=e.length,a=n?.isSelfClosing||!1,o=r,s=!1,c=0,l=0;for(;r<i;){let t=e.charCodeAt(r);if(s){t===c&&l!==J&&(s=!1),r++;continue}else if(t===K||t===q)s=!0,c=t;else if(t===W&&r+1<i&&e.charCodeAt(r+1)===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+2,attributes:an(t),selfClosing:!0,attrBuffer:t}}else if(t===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+1,attributes:an(t),selfClosing:a,attrBuffer:t}}r++,l=t}return{complete:!1,newPosition:r,attributes:Xt,selfClosing:!1,attrBuffer:e.substring(o,r)}}function an(e){if(!e)return Xt;let 
t={},n=e.length,r=0,i=0,a=1,o=2,s=3,c=4,l=5,u=i,d=0,f=0,p=0,m=0,h=``;for(;r<n;){let g=e.charCodeAt(r),_=Z(g);switch(u){case i:_||(u=a,d=r,f=0);break;case a:(g===G||_)&&(f=r,h=e.substring(d,f).toLowerCase(),u=g===G?s:o);break;case o:g===G?u=s:_||(t[h]=``,u=a,d=r,f=0);break;case s:g===K||g===q?(m=g,u=c,p=r+1):_||(u=l,p=r);break;case c:g===J&&r+1<n?r++:g===m&&(t[h]=e.substring(p,r),u=i);break;case l:(_||g===U)&&(t[h]=e.substring(p,r),u=i);break}r++}if(u===c||u===l)h&&(t[h]=e.substring(p,r));else if(u===a||u===o||u===s){f||=r;let n=e.substring(d,f).toLowerCase();n&&(t[n]=``)}return t}function on(e,t,n,r){if(t?.length){for(let r of t){let t=r.beforeNodeProcess?.(e,n);if(typeof t==`object`&&t.skip)return!0}if(e.node.type===I){let r=e.node;if(e.type===R)for(let e of t)e.processAttributes&&e.processAttributes(r,n);let i=e.type===R?`onNodeEnter`:`onNodeExit`,a=[];for(let e of t)if(e[i]){let t=e[i](r,n);t&&a.push(t)}a.length>0&&(r.pluginOutput=(r.pluginOutput||[]).concat(a))}else if(e.node.type===L&&e.type===R){let r=e.node;for(let e of t)if(e.processTextNode){let t=e.processTextNode(r,n);if(t){if(t.skip)return!0;r.value=t.content}}}}return r(e),!1}function sn(e,t,n){if(e===` `||e===`
|
|
9
|
+
`||e===` `||t===` `||t===`
|
|
10
|
+
`||t===` `)return!1;let r=new Set([`[`,`(`,`>`,`*`,`_`,"`"]),i=new Set([`]`,`)`,`<`,`.`,`,`,`!`,`?`,`:`,`;`,`*`,`_`,"`"]);return e===`|`&&t===`<`&&n&&n.depthMap[T]>0?!0:!(r.has(e)||i.has(t))}function cn(e,t,n){return!!e&&e!==`
|
|
11
|
+
`&&e!==` `&&e!==`[`&&e!==`>`&&!t?.tagHandler?.isInline&&n.value[0]!==` `}function ln(e){let t=e.tagId,n=e.depthMap;if(t!==C&&n[C]>0||t!==b&&n[b]>0)return z;let r=e.parent;for(;r;){if(r.tagHandler?.collapsesInnerWhiteSpace)return z;r=r.parent}return e.tagHandler?.spacing?e.tagHandler?.spacing:At}function un(n={}){let r={options:n,regionToggles:new Map,regionContentBuffers:new Map,depthMap:new Uint8Array(bt)};r.regionToggles.set(0,!0),r.regionContentBuffers.set(0,[]);let i=0;function a(t){let{type:n,node:i}=t,a=r.lastNode;r.lastNode=t.node,r.depth=i.depth;let o=r.regionContentBuffers.get(i.regionId||0)||[],s=o[o.length-1],c=s?.charAt(s.length-1)||``,l;if(l=s?.length>1?s.charAt(s.length-2):o[o.length-2]?.charAt(o[o.length-2].length-1),i.type===L&&n===R){let t=i;if(t.value){if(t.excludedFromMarkdown||t.value===` `&&c===`
|
|
12
|
+
`)return;cn(c,a,t)&&(t.value=` ${t.value}`),e(t,t.value,r)}r.lastTextNode=t;return}if(i.type!==I)return;let u={node:i,state:r},d=[],f=i;f.pluginOutput?.length&&(d.push(...f.pluginOutput),f.pluginOutput=[]);let p=r.lastContentCache,m=0;c===`
|
|
13
|
+
`&&m++,l===`
|
|
14
|
+
`&&m++;let h=n===R?`enter`:`exit`,g=i.tagHandler;if(!d.length&&g?.[h]){let e=g[h](u);e&&d.push(e)}let _=ln(i),v=_[n]||0,y=Math.max(0,v-m);if(y>0){if(!o.length){for(let t of d)e(i,t,r);return}let t=`
|
|
15
|
+
`.repeat(y);c===` `&&o?.length&&(o[o.length-1]=o[o.length-1].substring(0,o[o.length-1].length-1)),n===R?d.unshift(t):d.push(t)}else if(p&&r.lastTextNode?.containsWhitespace&&i.parent&&`value`in r.lastTextNode&&typeof r.lastTextNode.value==`string`&&(!i.parent.depthMap[O]||i.parent.tagId===O)){let e=i.tagHandler?.isInline,t=i.tagHandler?.collapsesInnerWhiteSpace,a=i.tagHandler?.spacing&&Array.isArray(i.tagHandler.spacing),s=!e&&!t&&v>0,c=(!e||n===St)&&!s&&!(t&&n===R)&&!(a&&n===R);if(c){let e=p.length,t=p.trimEnd(),n=e-t.length;n>0&&o?.length&&o[o.length-1]===p&&(o[o.length-1]=t)}r.lastTextNode=void 0}d[0]?.[0]&&n===R&&c&&sn(c,d[0][0],r)&&e(i,` `,r);for(let t of d)e(i,t,r)}function o(e){let t={depthMap:r.depthMap,depth:0,plugins:r.options?.plugins||[]};Qt(e,t,e=>{on(e,r.options?.plugins,r,a)})}function s(){let e=t(r);return e.trimEnd()}function c(){let e=[];for(let[t,n]of Array.from(r.regionContentBuffers.entries())){let i=r.regionToggles.get(t);i&&e.push(...n)}let t=e.join(``).trimStart(),n=t.slice(i);return i=t.length,n}return{processEvent:a,processHtml:o,getMarkdown:s,getMarkdownChunk:c,state:r}}function dn(e,t={}){let n=un(t);return n.processHtml(e),n.getMarkdown()}const fn={htmlToMarkdown:dn};typeof window<`u`&&(window.mdream=fn);var pn=fn;
|
|
16
|
+
|
|
17
|
+
// Expose mdream globally
|
|
18
|
+
if (typeof window !== 'undefined') {
|
|
19
|
+
window.mdream = fn;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
})();
|
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-
|
|
2
|
-
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-
|
|
1
|
+
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-B94khc0C.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
|
|
3
3
|
import { ReadableStream } from "node:stream/web";
|
|
4
4
|
|
|
5
5
|
//#region src/const.d.ts
|
|
@@ -115,43 +115,6 @@ declare const TagIdMap: {
|
|
|
115
115
|
readonly caption: 107;
|
|
116
116
|
};
|
|
117
117
|
//#endregion
|
|
118
|
-
//#region src/llms-txt.d.ts
|
|
119
|
-
interface LlmsTxtArtifactsOptions {
|
|
120
|
-
patterns?: string | string[];
|
|
121
|
-
files?: ProcessedFile[];
|
|
122
|
-
siteName?: string;
|
|
123
|
-
description?: string;
|
|
124
|
-
origin?: string;
|
|
125
|
-
generateFull?: boolean;
|
|
126
|
-
generateMarkdown?: boolean;
|
|
127
|
-
outputDir?: string;
|
|
128
|
-
}
|
|
129
|
-
interface ProcessedFile {
|
|
130
|
-
filePath?: string;
|
|
131
|
-
title: string;
|
|
132
|
-
content: string;
|
|
133
|
-
url: string;
|
|
134
|
-
metadata?: {
|
|
135
|
-
title?: string;
|
|
136
|
-
description?: string;
|
|
137
|
-
keywords?: string;
|
|
138
|
-
author?: string;
|
|
139
|
-
};
|
|
140
|
-
}
|
|
141
|
-
interface LlmsTxtArtifactsResult {
|
|
142
|
-
llmsTxt: string;
|
|
143
|
-
llmsFullTxt?: string;
|
|
144
|
-
markdownFiles?: {
|
|
145
|
-
path: string;
|
|
146
|
-
content: string;
|
|
147
|
-
}[];
|
|
148
|
-
processedFiles: ProcessedFile[];
|
|
149
|
-
}
|
|
150
|
-
/**
|
|
151
|
-
* Main function to process files and generate llms.txt artifacts
|
|
152
|
-
*/
|
|
153
|
-
declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
|
|
154
|
-
//#endregion
|
|
155
118
|
//#region src/markdown-processor.d.ts
|
|
156
119
|
interface MarkdownState {
|
|
157
120
|
/** Configuration options for conversion */
|
|
@@ -218,4 +181,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
|
|
|
218
181
|
//#region src/index.d.ts
|
|
219
182
|
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
220
183
|
//#endregion
|
|
221
|
-
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext,
|
|
184
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { TagIdMap, createPlugin } from "./_chunks/
|
|
2
|
-
import { MarkdownProcessor,
|
|
1
|
+
import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-B4vBEPKi.mjs";
|
|
3
3
|
|
|
4
|
-
export { MarkdownProcessor, TagIdMap, createPlugin,
|
|
4
|
+
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
//#region src/llms-txt.d.ts
|
|
2
|
+
interface LlmsTxtArtifactsOptions {
|
|
3
|
+
patterns?: string | string[];
|
|
4
|
+
files?: ProcessedFile[];
|
|
5
|
+
siteName?: string;
|
|
6
|
+
description?: string;
|
|
7
|
+
origin?: string;
|
|
8
|
+
generateFull?: boolean;
|
|
9
|
+
generateMarkdown?: boolean;
|
|
10
|
+
outputDir?: string;
|
|
11
|
+
}
|
|
12
|
+
interface ProcessedFile {
|
|
13
|
+
filePath?: string;
|
|
14
|
+
title: string;
|
|
15
|
+
content: string;
|
|
16
|
+
url: string;
|
|
17
|
+
metadata?: {
|
|
18
|
+
title?: string;
|
|
19
|
+
description?: string;
|
|
20
|
+
keywords?: string;
|
|
21
|
+
author?: string;
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
interface LlmsTxtArtifactsResult {
|
|
25
|
+
llmsTxt: string;
|
|
26
|
+
llmsFullTxt?: string;
|
|
27
|
+
markdownFiles?: {
|
|
28
|
+
path: string;
|
|
29
|
+
content: string;
|
|
30
|
+
}[];
|
|
31
|
+
processedFiles: ProcessedFile[];
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Main function to process files and generate llms.txt artifacts
|
|
35
|
+
*/
|
|
36
|
+
declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
|
|
37
|
+
//#endregion
|
|
38
|
+
export { LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, ProcessedFile, generateLlmsTxtArtifacts };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-
|
|
2
|
-
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-
|
|
1
|
+
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-B94khc0C.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/plugins/filter.d.ts
|
|
5
5
|
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { createPlugin
|
|
2
|
-
import {
|
|
1
|
+
import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { extractionPlugin } from "./_chunks/extraction-BSOWm6fo.mjs";
|
|
3
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-TeB1_RYL.mjs";
|
|
3
4
|
|
|
4
5
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import "../_chunks/
|
|
2
|
-
import "../_chunks/
|
|
3
|
-
import
|
|
1
|
+
import "../_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import "../_chunks/extraction-BSOWm6fo.mjs";
|
|
3
|
+
import "../_chunks/plugins-TeB1_RYL.mjs";
|
|
4
|
+
import { withMinimalPreset } from "../_chunks/minimal-DSW9dhXV.mjs";
|
|
4
5
|
|
|
5
6
|
export { withMinimalPreset };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.11.0",
|
|
5
5
|
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -23,6 +23,14 @@
|
|
|
23
23
|
},
|
|
24
24
|
"default": "./dist/index.mjs"
|
|
25
25
|
},
|
|
26
|
+
"./llms-txt": {
|
|
27
|
+
"types": "./dist/llms-txt.d.mts",
|
|
28
|
+
"import": {
|
|
29
|
+
"types": "./dist/llms-txt.d.mts",
|
|
30
|
+
"default": "./dist/llms-txt.mjs"
|
|
31
|
+
},
|
|
32
|
+
"default": "./dist/llms-txt.mjs"
|
|
33
|
+
},
|
|
26
34
|
"./cli": {
|
|
27
35
|
"types": "./dist/cli.d.mts",
|
|
28
36
|
"import": {
|
|
@@ -49,6 +57,8 @@
|
|
|
49
57
|
}
|
|
50
58
|
},
|
|
51
59
|
"main": "./dist/index.mjs",
|
|
60
|
+
"unpkg": "./dist/iife.js",
|
|
61
|
+
"jsdelivr": "./dist/iife.js",
|
|
52
62
|
"types": "./dist/index.d.mts",
|
|
53
63
|
"bin": {
|
|
54
64
|
"mdream": "./bin/mdream.mjs"
|
|
@@ -57,6 +67,7 @@
|
|
|
57
67
|
"bin",
|
|
58
68
|
"dist"
|
|
59
69
|
],
|
|
70
|
+
"browser": "./dist/iife.js",
|
|
60
71
|
"dependencies": {
|
|
61
72
|
"cac": "^6.7.14",
|
|
62
73
|
"pathe": "^2.0.3",
|
|
@@ -79,6 +90,7 @@
|
|
|
79
90
|
"typecheck": "tsc --noEmit",
|
|
80
91
|
"dev:prepare": "obuild --stub",
|
|
81
92
|
"test": "vitest test",
|
|
93
|
+
"test:browser": "vitest test --project=browser",
|
|
82
94
|
"test:attw": "attw --pack"
|
|
83
95
|
}
|
|
84
96
|
}
|
|
File without changes
|