mdream 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -9
- package/dist/_chunks/const.mjs +110 -228
- package/dist/_chunks/extraction.mjs +24 -1
- package/dist/_chunks/markdown-processor.mjs +256 -165
- package/dist/_chunks/plugin.mjs +7 -0
- package/dist/_chunks/{tailwind.mjs → plugins.mjs} +109 -26
- package/dist/_chunks/{stream.mjs → src.mjs} +16 -1
- package/dist/cli.mjs +7 -1
- package/dist/iife.js +3 -3
- package/dist/index.mjs +3 -8
- package/dist/llms-txt.mjs +91 -5
- package/dist/negotiate.d.mts +26 -0
- package/dist/negotiate.mjs +92 -0
- package/dist/plugins.mjs +2 -1
- package/dist/preset/minimal.mjs +28 -18
- package/dist/splitter.mjs +34 -19
- package/package.json +10 -2
package/dist/llms-txt.mjs
CHANGED
|
@@ -1,10 +1,19 @@
|
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
1
2
|
import "./_chunks/markdown-processor.mjs";
|
|
2
|
-
import "./_chunks/
|
|
3
|
-
import { htmlToMarkdown } from "./
|
|
3
|
+
import "./_chunks/plugin.mjs";
|
|
4
|
+
import { t as htmlToMarkdown } from "./_chunks/src.mjs";
|
|
4
5
|
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
5
6
|
import { mkdir, open, readFile } from "node:fs/promises";
|
|
6
7
|
import { basename, dirname, join, relative, sep } from "pathe";
|
|
7
8
|
import { glob } from "tinyglobby";
|
|
9
|
+
//#region src/llms-txt.ts
|
|
10
|
+
const FRONTMATTER_RE = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
|
|
11
|
+
const ANCHOR_INVALID_CHARS_RE = /[^a-z0-9]/g;
|
|
12
|
+
const LEADING_SLASH_RE = /^\//;
|
|
13
|
+
const TRAILING_SLASH_RE = /\/$/;
|
|
14
|
+
/**
|
|
15
|
+
* Extract metadata from HTML content using mdream's extraction plugin
|
|
16
|
+
*/
|
|
8
17
|
function extractMetadata(html, url) {
|
|
9
18
|
let title = "";
|
|
10
19
|
let description = "";
|
|
@@ -40,6 +49,9 @@ function extractMetadata(html, url) {
|
|
|
40
49
|
author: author || void 0
|
|
41
50
|
};
|
|
42
51
|
}
|
|
52
|
+
/**
|
|
53
|
+
* Convert file path to URL path
|
|
54
|
+
*/
|
|
43
55
|
function pathToUrl(filePath, baseDir) {
|
|
44
56
|
let url = relative(baseDir, filePath);
|
|
45
57
|
url = url.split(sep).join("/");
|
|
@@ -49,6 +61,9 @@ function pathToUrl(filePath, baseDir) {
|
|
|
49
61
|
if (!url.startsWith("/")) url = `/${url}`;
|
|
50
62
|
return url;
|
|
51
63
|
}
|
|
64
|
+
/**
|
|
65
|
+
* Process HTML files from glob patterns
|
|
66
|
+
*/
|
|
52
67
|
async function processHtmlFiles(patterns, origin) {
|
|
53
68
|
const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
|
|
54
69
|
const allFiles = [];
|
|
@@ -76,6 +91,9 @@ async function processHtmlFiles(patterns, origin) {
|
|
|
76
91
|
}
|
|
77
92
|
return results;
|
|
78
93
|
}
|
|
94
|
+
/**
|
|
95
|
+
* Generate llms.txt content
|
|
96
|
+
*/
|
|
79
97
|
function generateLlmsTxtContent(files, options) {
|
|
80
98
|
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
81
99
|
let content = `# ${siteName}\n\n`;
|
|
@@ -99,8 +117,11 @@ function generateLlmsTxtContent(files, options) {
|
|
|
99
117
|
if (notes) content += `\n${formatNotes(notes)}`;
|
|
100
118
|
return content;
|
|
101
119
|
}
|
|
120
|
+
/**
|
|
121
|
+
* Parse frontmatter from markdown content
|
|
122
|
+
*/
|
|
102
123
|
function parseFrontmatter(content) {
|
|
103
|
-
const match = content.match(
|
|
124
|
+
const match = content.match(FRONTMATTER_RE);
|
|
104
125
|
if (!match) return {
|
|
105
126
|
frontmatter: null,
|
|
106
127
|
body: content
|
|
@@ -121,11 +142,17 @@ function parseFrontmatter(content) {
|
|
|
121
142
|
body
|
|
122
143
|
};
|
|
123
144
|
}
|
|
145
|
+
/**
|
|
146
|
+
* Serialize frontmatter object to YAML-like format
|
|
147
|
+
*/
|
|
124
148
|
function serializeFrontmatter(data) {
|
|
125
149
|
const lines = [];
|
|
126
150
|
for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
|
|
127
151
|
return lines.join("\n");
|
|
128
152
|
}
|
|
153
|
+
/**
|
|
154
|
+
* Generate llms-full.txt content with complete page content
|
|
155
|
+
*/
|
|
129
156
|
function generateLlmsFullTxtContent(files, options) {
|
|
130
157
|
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
131
158
|
let content = `# ${siteName}\n\n`;
|
|
@@ -135,7 +162,7 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
135
162
|
if (files.length > 0) {
|
|
136
163
|
content += `## Table of Contents\n\n`;
|
|
137
164
|
for (const file of files) {
|
|
138
|
-
const anchor = file.title.toLowerCase().replace(
|
|
165
|
+
const anchor = file.title.toLowerCase().replace(ANCHOR_INVALID_CHARS_RE, "-");
|
|
139
166
|
content += `- [${file.title}](#${anchor})\n`;
|
|
140
167
|
}
|
|
141
168
|
content += `\n---\n\n`;
|
|
@@ -166,10 +193,13 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
166
193
|
if (notes) content += `\n${formatNotes(notes)}`;
|
|
167
194
|
return content;
|
|
168
195
|
}
|
|
196
|
+
/**
|
|
197
|
+
* Generate individual markdown files structure
|
|
198
|
+
*/
|
|
169
199
|
function generateMarkdownFilesContent(files) {
|
|
170
200
|
const markdownFiles = [];
|
|
171
201
|
for (const file of files) {
|
|
172
|
-
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(
|
|
202
|
+
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(LEADING_SLASH_RE, "").replace(TRAILING_SLASH_RE, "")}.md`;
|
|
173
203
|
markdownFiles.push({
|
|
174
204
|
path: mdPath,
|
|
175
205
|
content: file.content
|
|
@@ -177,6 +207,9 @@ function generateMarkdownFilesContent(files) {
|
|
|
177
207
|
}
|
|
178
208
|
return markdownFiles;
|
|
179
209
|
}
|
|
210
|
+
/**
|
|
211
|
+
* Main function to process files and generate llms.txt artifacts
|
|
212
|
+
*/
|
|
180
213
|
async function generateLlmsTxtArtifacts(options) {
|
|
181
214
|
let files;
|
|
182
215
|
if (options.files) files = options.files;
|
|
@@ -194,6 +227,9 @@ async function generateLlmsTxtArtifacts(options) {
|
|
|
194
227
|
processedFiles: files
|
|
195
228
|
};
|
|
196
229
|
}
|
|
230
|
+
/**
|
|
231
|
+
* Format a section with title, description, and links
|
|
232
|
+
*/
|
|
197
233
|
function formatSection(section) {
|
|
198
234
|
let content = `## ${section.title}\n\n`;
|
|
199
235
|
if (section.description) {
|
|
@@ -209,18 +245,67 @@ function formatSection(section) {
|
|
|
209
245
|
}
|
|
210
246
|
return content;
|
|
211
247
|
}
|
|
248
|
+
/**
|
|
249
|
+
* Format notes section
|
|
250
|
+
*/
|
|
212
251
|
function formatNotes(notes) {
|
|
213
252
|
const noteLines = Array.isArray(notes) ? notes : [notes];
|
|
214
253
|
let content = "";
|
|
215
254
|
for (const note of noteLines) content += `${note}\n\n`;
|
|
216
255
|
return content;
|
|
217
256
|
}
|
|
257
|
+
/**
|
|
258
|
+
* Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
|
|
259
|
+
*
|
|
260
|
+
* Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
|
|
261
|
+
* never keeping full content in memory. Creates outputDir recursively if needed.
|
|
262
|
+
*
|
|
263
|
+
* @example
|
|
264
|
+
* ```typescript
|
|
265
|
+
* const stream = createLlmsTxtStream({
|
|
266
|
+
* siteName: 'My Docs',
|
|
267
|
+
* description: 'Documentation site',
|
|
268
|
+
* origin: 'https://example.com',
|
|
269
|
+
* generateFull: true,
|
|
270
|
+
* outputDir: './dist',
|
|
271
|
+
* sections: [
|
|
272
|
+
* {
|
|
273
|
+
* title: 'Getting Started',
|
|
274
|
+
* description: 'Quick start guide',
|
|
275
|
+
* links: [
|
|
276
|
+
* { title: 'Installation', href: '/install', description: 'How to install' },
|
|
277
|
+
* { title: 'Quick Start', href: '/quickstart' },
|
|
278
|
+
* ],
|
|
279
|
+
* },
|
|
280
|
+
* ],
|
|
281
|
+
* notes: ['Generated by mdream', 'Last updated: 2024'],
|
|
282
|
+
* })
|
|
283
|
+
*
|
|
284
|
+
* const writer = stream.getWriter()
|
|
285
|
+
* await writer.write({
|
|
286
|
+
* title: 'Home',
|
|
287
|
+
* content: '# Welcome\n\nHome page content.',
|
|
288
|
+
* url: '/',
|
|
289
|
+
* })
|
|
290
|
+
* await writer.close()
|
|
291
|
+
* ```
|
|
292
|
+
*
|
|
293
|
+
* @param options - Configuration options
|
|
294
|
+
* @returns WritableStream that accepts ProcessedFile objects
|
|
295
|
+
*/
|
|
296
|
+
/**
|
|
297
|
+
* Get group prefix for a URL (up to 2 segments)
|
|
298
|
+
*/
|
|
218
299
|
/**
 * Get group prefix for a URL (up to 2 segments)
 */
function getGroupPrefix(url, depth) {
	const parts = url.split("/").filter(Boolean);
	switch (parts.length) {
		case 0: return "/";
		case 1: return `/${parts[0]}`;
		default: return depth === 1 ? `/${parts[0]}` : `/${parts[0]}/${parts[1]}`;
	}
}
|
|
305
|
+
/**
|
|
306
|
+
* Sort pages by URL path in hierarchical order (directory tree structure)
|
|
307
|
+
* Groups by up to 2 segments, with root-level pages without nesting grouped together
|
|
308
|
+
*/
|
|
224
309
|
function sortPagesByPath(pages) {
|
|
225
310
|
const twoSegmentCount = /* @__PURE__ */ new Map();
|
|
226
311
|
for (const page of pages) {
|
|
@@ -380,4 +465,5 @@ function createLlmsTxtStream(options = {}) {
|
|
|
380
465
|
}
|
|
381
466
|
});
|
|
382
467
|
}
|
|
468
|
+
//#endregion
|
|
383
469
|
export { createLlmsTxtStream, generateLlmsTxtArtifacts };
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
//#region src/negotiate.d.ts
|
|
2
|
+
interface AcceptEntry {
|
|
3
|
+
type: string;
|
|
4
|
+
q: number;
|
|
5
|
+
position: number;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Parse an HTTP Accept header into an ordered list of media types with quality values.
|
|
9
|
+
* Supports quality weights (q=0.9) and preserves original position for tie-breaking.
|
|
10
|
+
*/
|
|
11
|
+
declare function parseAcceptHeader(accept: string): AcceptEntry[];
|
|
12
|
+
/**
|
|
13
|
+
* Determine if a client prefers markdown over HTML using proper content negotiation.
|
|
14
|
+
*
|
|
15
|
+
* Uses Accept header quality weights and position ordering:
|
|
16
|
+
* - If text/markdown or text/plain has higher quality than text/html → markdown
|
|
17
|
+
* - If same quality, earlier position in Accept header wins
|
|
18
|
+
* - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
|
|
19
|
+
* - sec-fetch-dest: document always returns false (browser navigation)
|
|
20
|
+
*
|
|
21
|
+
* @param acceptHeader - The HTTP Accept header value
|
|
22
|
+
* @param secFetchDest - The Sec-Fetch-Dest header value
|
|
23
|
+
*/
|
|
24
|
+
declare function shouldServeMarkdown(acceptHeader?: string, secFetchDest?: string): boolean;
|
|
25
|
+
//#endregion
|
|
26
|
+
export { parseAcceptHeader, shouldServeMarkdown };
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
//#region src/negotiate.ts
|
|
2
|
+
/**
|
|
3
|
+
* Parse an HTTP Accept header into an ordered list of media types with quality values.
|
|
4
|
+
* Supports quality weights (q=0.9) and preserves original position for tie-breaking.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Parse an HTTP Accept header into an ordered list of media types with quality values.
 * Supports quality weights (q=0.9) and preserves original position for tie-breaking.
 */
function parseAcceptHeader(accept) {
	if (!accept) return [];
	const parts = accept.split(",");
	const entries = [];
	// `position` is the comma-separated index, counting empty segments too.
	for (let position = 0; position < parts.length; position++) {
		const raw = parts[position].trim();
		if (raw === "") continue;
		const sep = raw.indexOf(";");
		const type = sep === -1 ? raw : raw.slice(0, sep).trim();
		let q = 1;
		if (sep !== -1) {
			const params = raw.slice(sep + 1);
			const qAt = params.indexOf("q=");
			if (qAt !== -1) {
				// Consume the q value up to the next ";" or " ".
				let end = qAt + 2;
				while (end < params.length && params[end] !== ";" && params[end] !== " ") end++;
				// Unparseable q values (NaN) fall back to 0.
				q = Number(params.slice(qAt + 2, end)) || 0;
			}
		}
		entries.push({
			type,
			q,
			position
		});
	}
	return entries;
}
|
|
36
|
+
/**
|
|
37
|
+
* Determine if a client prefers markdown over HTML using proper content negotiation.
|
|
38
|
+
*
|
|
39
|
+
* Uses Accept header quality weights and position ordering:
|
|
40
|
+
* - If text/markdown or text/plain has higher quality than text/html → markdown
|
|
41
|
+
* - If same quality, earlier position in Accept header wins
|
|
42
|
+
* - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
|
|
43
|
+
* - sec-fetch-dest: document always returns false (browser navigation)
|
|
44
|
+
*
|
|
45
|
+
* @param acceptHeader - The HTTP Accept header value
|
|
46
|
+
* @param secFetchDest - The Sec-Fetch-Dest header value
|
|
47
|
+
*/
|
|
48
|
+
/**
 * Determine if a client prefers markdown over HTML using proper content negotiation.
 *
 * Uses Accept header quality weights and position ordering:
 * - If text/markdown or text/plain has higher quality than text/html → markdown
 * - If same quality, earlier position in Accept header wins
 * - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
 * - sec-fetch-dest: document always returns false (browser navigation)
 *
 * @param acceptHeader - The HTTP Accept header value
 * @param secFetchDest - The Sec-Fetch-Dest header value
 */
function shouldServeMarkdown(acceptHeader, secFetchDest) {
	// Browser navigations always get HTML.
	if (secFetchDest === "document") return false;
	if (!acceptHeader) return false;
	const tokens = acceptHeader.split(",");
	let mdQ = -1;
	let mdPos = -1;
	let htmlQ = -1;
	let htmlPos = -1;
	for (let pos = 0; pos < tokens.length; pos++) {
		const token = tokens[pos].trim();
		if (token === "") continue;
		const sep = token.indexOf(";");
		const type = sep === -1 ? token : token.slice(0, sep).trim();
		let q = 1;
		if (sep !== -1) {
			const params = token.slice(sep + 1);
			const qAt = params.indexOf("q=");
			if (qAt !== -1) {
				// Consume the q value up to the next ";" or " "; NaN falls back to 0.
				let end = qAt + 2;
				while (end < params.length && params[end] !== ";" && params[end] !== " ") end++;
				q = Number(params.slice(qAt + 2, end)) || 0;
			}
		}
		if (type === "text/markdown" || type === "text/plain") {
			// Keep the best-quality markdown entry; earliest position wins ties.
			if (q > mdQ || (q === mdQ && (mdPos === -1 || pos < mdPos))) {
				mdQ = q;
				mdPos = pos;
			}
		} else if (type === "text/html") {
			htmlQ = q;
			htmlPos = pos;
		}
	}
	if (mdPos === -1) return false;
	if (htmlPos === -1) return true;
	return mdQ > htmlQ || (mdQ === htmlQ && mdPos < htmlPos);
}
|
|
91
|
+
//#endregion
|
|
92
|
+
export { parseAcceptHeader, shouldServeMarkdown };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
1
2
|
import { t as createPlugin } from "./_chunks/plugin.mjs";
|
|
2
3
|
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
3
|
-
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/
|
|
4
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/plugins.mjs";
|
|
4
5
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,30 +1,40 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
1
|
+
import "../_chunks/const.mjs";
|
|
2
|
+
import "../_chunks/plugin.mjs";
|
|
3
|
+
import "../_chunks/extraction.mjs";
|
|
4
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "../_chunks/plugins.mjs";
|
|
5
|
+
//#region src/preset/minimal.ts
|
|
6
|
+
/**
|
|
7
|
+
* Creates a configurable minimal preset with advanced options
|
|
8
|
+
*
|
|
9
|
+
* @param options HTML to Markdown options
|
|
10
|
+
* @returns HTML to Markdown options with configured plugins
|
|
11
|
+
*/
|
|
3
12
|
/**
 * Creates a configurable minimal preset with advanced options
 *
 * @param options HTML to Markdown options
 * @returns HTML to Markdown options with configured plugins
 */
function withMinimalPreset(options = {}) {
	// Tag ids excluded from output by the filter plugin (build-time constants).
	const filter = filterPlugin({ exclude: [
		40,
		68,
		103,
		58,
		47,
		88,
		73,
		59,
		66,
		65,
		43,
		41
	] });
	// User plugins run after the preset plugins; the filter always runs last.
	const plugins = [
		frontmatterPlugin(),
		isolateMainPlugin(),
		tailwindPlugin(),
		...options.plugins || [],
		filter
	];
	return {
		...options,
		plugins
	};
}
|
|
39
|
+
//#endregion
|
|
30
40
|
export { withMinimalPreset };
|
package/dist/splitter.mjs
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
|
-
import
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
2
2
|
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor.mjs";
|
|
3
|
+
//#region src/splitter.ts
|
|
4
|
+
const MARKDOWN_HEADER_LINE_RE = /^#{1,6}\s+/;
|
|
5
|
+
const NEWLINE_RE = /\n/g;
|
|
3
6
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
8,
|
|
8
|
+
9,
|
|
9
|
+
10,
|
|
10
|
+
11,
|
|
11
|
+
12
|
|
9
12
|
];
|
|
10
13
|
function createOptions(options) {
|
|
11
14
|
return {
|
|
@@ -29,9 +32,16 @@ function getCodeLanguage(node) {
|
|
|
29
32
|
/**
 * Whether a header tag id is configured as a chunk-split boundary.
 */
function shouldSplitOnHeader(tagId, options) {
	const { headersToSplitOn: splitTargets } = options;
	return splitTargets.includes(tagId);
}
|
|
35
|
+
/**
|
|
36
|
+
* Get current markdown content WITHOUT clearing buffers
|
|
37
|
+
*/
|
|
32
38
|
/**
 * Get current markdown content WITHOUT clearing buffers
 */
function getCurrentMarkdown(state) {
	const joined = state.buffer.join("");
	return joined.trimStart();
}
|
|
41
|
+
/**
|
|
42
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
43
|
+
* Yields chunks during HTML event processing for better memory efficiency
|
|
44
|
+
*/
|
|
35
45
|
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
36
46
|
const opts = createOptions(options);
|
|
37
47
|
if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
|
|
@@ -58,7 +68,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
58
68
|
}
|
|
59
69
|
let chunkContent = originalChunkContent;
|
|
60
70
|
if (opts.stripHeaders) {
|
|
61
|
-
chunkContent = chunkContent.split("\n").filter((line) => !
|
|
71
|
+
chunkContent = chunkContent.split("\n").filter((line) => !MARKDOWN_HEADER_LINE_RE.test(line)).join("\n").trim();
|
|
62
72
|
if (!chunkContent) {
|
|
63
73
|
lastChunkEndPosition = chunkEnd;
|
|
64
74
|
return;
|
|
@@ -68,13 +78,13 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
68
78
|
content: chunkContent.trimEnd(),
|
|
69
79
|
metadata: { loc: { lines: {
|
|
70
80
|
from: lineNumber,
|
|
71
|
-
to: lineNumber + (originalChunkContent.match(
|
|
81
|
+
to: lineNumber + (originalChunkContent.match(NEWLINE_RE) || []).length
|
|
72
82
|
} } }
|
|
73
83
|
};
|
|
74
84
|
if (headerHierarchy.size > 0) {
|
|
75
85
|
chunk.metadata.headers = {};
|
|
76
86
|
for (const [tagId, text] of headerHierarchy.entries()) {
|
|
77
|
-
const level = `h${tagId -
|
|
87
|
+
const level = `h${tagId - 7 + 1}`;
|
|
78
88
|
chunk.metadata.headers[level] = text;
|
|
79
89
|
}
|
|
80
90
|
}
|
|
@@ -86,7 +96,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
86
96
|
const maxOverlap = Math.max(0, originalChunkContent.length - 1);
|
|
87
97
|
lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
|
|
88
98
|
} else lastChunkEndPosition = chunkEnd;
|
|
89
|
-
lineNumber += (originalChunkContent.match(
|
|
99
|
+
lineNumber += (originalChunkContent.match(NEWLINE_RE) || []).length;
|
|
90
100
|
}
|
|
91
101
|
const parseState = {
|
|
92
102
|
depthMap: processor.state.depthMap,
|
|
@@ -99,36 +109,36 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
99
109
|
});
|
|
100
110
|
for (const event of eventBuffer) {
|
|
101
111
|
const { type: eventType, node } = event;
|
|
102
|
-
if (node.type ===
|
|
112
|
+
if (node.type === 1) {
|
|
103
113
|
const element = node;
|
|
104
114
|
const tagId = element.tagId;
|
|
105
|
-
if (tagId && tagId >=
|
|
106
|
-
if (eventType ===
|
|
115
|
+
if (tagId && tagId >= 7 && tagId <= 12) {
|
|
116
|
+
if (eventType === 0) {
|
|
107
117
|
collectingHeaderText = true;
|
|
108
118
|
currentHeaderTagId = tagId;
|
|
109
119
|
currentHeaderText = "";
|
|
110
120
|
if (shouldSplitOnHeader(tagId, opts)) {
|
|
111
121
|
if (seenSplitHeaders.has(tagId)) {
|
|
112
122
|
yield* flushChunk();
|
|
113
|
-
for (let i = tagId; i <=
|
|
123
|
+
for (let i = tagId; i <= 12; i++) headerHierarchy.delete(i);
|
|
114
124
|
}
|
|
115
125
|
seenSplitHeaders.add(tagId);
|
|
116
126
|
}
|
|
117
|
-
} else if (eventType ===
|
|
127
|
+
} else if (eventType === 1 && currentHeaderTagId === tagId) {
|
|
118
128
|
headerHierarchy.set(tagId, currentHeaderText.trim());
|
|
119
129
|
collectingHeaderText = false;
|
|
120
130
|
currentHeaderTagId = null;
|
|
121
131
|
}
|
|
122
132
|
}
|
|
123
|
-
if (tagId ===
|
|
124
|
-
if (eventType ===
|
|
133
|
+
if (tagId === 23 && element.depthMap[34] > 0) {
|
|
134
|
+
if (eventType === 0) {
|
|
125
135
|
const lang = getCodeLanguage(element);
|
|
126
136
|
if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
|
|
127
137
|
}
|
|
128
138
|
}
|
|
129
|
-
if (tagId ===
|
|
139
|
+
if (tagId === 13 && eventType === 0) yield* flushChunk();
|
|
130
140
|
}
|
|
131
|
-
if (collectingHeaderText && node.type ===
|
|
141
|
+
if (collectingHeaderText && node.type === 2) currentHeaderText += node.value;
|
|
132
142
|
processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
|
|
133
143
|
if (!opts.returnEachLine) {
|
|
134
144
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
@@ -166,6 +176,10 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
166
176
|
}
|
|
167
177
|
yield* flushChunk();
|
|
168
178
|
}
|
|
179
|
+
/**
|
|
180
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
181
|
+
* Chunks are created during HTML event processing
|
|
182
|
+
*/
|
|
169
183
|
function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
170
184
|
const opts = createOptions(options);
|
|
171
185
|
const chunks = [];
|
|
@@ -193,4 +207,5 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
193
207
|
}
|
|
194
208
|
return chunks;
|
|
195
209
|
}
|
|
210
|
+
//#endregion
|
|
196
211
|
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.17.0",
|
|
5
5
|
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -39,6 +39,14 @@
|
|
|
39
39
|
},
|
|
40
40
|
"default": "./dist/cli.mjs"
|
|
41
41
|
},
|
|
42
|
+
"./negotiate": {
|
|
43
|
+
"types": "./dist/negotiate.d.mts",
|
|
44
|
+
"import": {
|
|
45
|
+
"types": "./dist/negotiate.d.mts",
|
|
46
|
+
"default": "./dist/negotiate.mjs"
|
|
47
|
+
},
|
|
48
|
+
"default": "./dist/negotiate.mjs"
|
|
49
|
+
},
|
|
42
50
|
"./plugins": {
|
|
43
51
|
"types": "./dist/plugins.d.mts",
|
|
44
52
|
"import": {
|
|
@@ -77,7 +85,7 @@
|
|
|
77
85
|
],
|
|
78
86
|
"browser": "./dist/iife.js",
|
|
79
87
|
"dependencies": {
|
|
80
|
-
"cac": "^
|
|
88
|
+
"cac": "^7.0.0",
|
|
81
89
|
"pathe": "^2.0.3",
|
|
82
90
|
"tinyglobby": "^0.2.15"
|
|
83
91
|
},
|