mdream 0.13.3 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -0
- package/dist/_chunks/{const-BOAJ1T5c.mjs → const-Bf_XN9U9.mjs} +2 -5
- package/dist/_chunks/{extraction-BPaDGYvv.mjs → extraction-BA9MDtq3.mjs} +4 -6
- package/dist/_chunks/{llms-txt-DC12yO2l.mjs → llms-txt-T79S7X24.mjs} +123 -35
- package/dist/_chunks/{markdown-processor-f7XT0--8.mjs → markdown-processor-D26Uo5td.mjs} +35 -64
- package/dist/_chunks/{minimal-co1tIZYm.mjs → minimal-BiDhcwif.mjs} +3 -3
- package/dist/_chunks/{plugin-DrovQriD.mjs → plugin-CjWWQTuL.mjs} +1 -1
- package/dist/_chunks/{plugin-CgnpSqtP.d.mts → plugin-D5soyEXm.d.mts} +2 -2
- package/dist/_chunks/{plugins-C5_irVJs.mjs → plugins-DJnqR2fA.mjs} +23 -41
- package/dist/_chunks/{src-C3QpB75q.mjs → src-BJpipdul.mjs} +3 -4
- package/dist/_chunks/{types-DqiI86yW.d.mts → types-CT4ZxeOH.d.mts} +1 -1
- package/dist/cli.mjs +12 -18
- package/dist/iife.js +8 -18
- package/dist/index.d.mts +2 -5
- package/dist/index.mjs +4 -4
- package/dist/llms-txt.d.mts +45 -1
- package/dist/llms-txt.mjs +7 -7
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +4 -4
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +5 -5
- package/dist/splitter.d.mts +7 -2
- package/dist/splitter.mjs +45 -34
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -311,6 +311,117 @@ htmlToMarkdown(html, { plugins: [plugin] })
|
|
|
311
311
|
|
|
312
312
|
The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
|
|
313
313
|
|
|
314
|
+
## Markdown Splitting
|
|
315
|
+
|
|
316
|
+
Split HTML into chunks during conversion for LLM context windows, vector databases, or document processing.
|
|
317
|
+
|
|
318
|
+
### Basic Chunking
|
|
319
|
+
|
|
320
|
+
```ts
|
|
321
|
+
import { TAG_H2 } from 'mdream'
|
|
322
|
+
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
323
|
+
|
|
324
|
+
const html = `
|
|
325
|
+
<h1>Documentation</h1>
|
|
326
|
+
<h2>Installation</h2>
|
|
327
|
+
<p>Install via npm...</p>
|
|
328
|
+
<h2>Usage</h2>
|
|
329
|
+
<p>Use it like this...</p>
|
|
330
|
+
`
|
|
331
|
+
|
|
332
|
+
const chunks = htmlToMarkdownSplitChunks(html, {
|
|
333
|
+
headersToSplitOn: [TAG_H2], // Split on h2 headers
|
|
334
|
+
chunkSize: 1000, // Max chars per chunk
|
|
335
|
+
chunkOverlap: 200, // Overlap for context
|
|
336
|
+
stripHeaders: true // Remove headers from content
|
|
337
|
+
})
|
|
338
|
+
|
|
339
|
+
// Each chunk includes content and metadata
|
|
340
|
+
chunks.forEach((chunk) => {
|
|
341
|
+
console.log(chunk.content)
|
|
342
|
+
console.log(chunk.metadata.headers) // { h1: "Documentation", h2: "Installation" }
|
|
343
|
+
console.log(chunk.metadata.code) // Language if chunk contains code
|
|
344
|
+
console.log(chunk.metadata.loc) // Line numbers
|
|
345
|
+
})
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
### Streaming Chunks (Memory Efficient)
|
|
349
|
+
|
|
350
|
+
For large documents, use the generator version to process chunks one at a time:
|
|
351
|
+
|
|
352
|
+
```ts
|
|
353
|
+
import { htmlToMarkdownSplitChunksStream } from 'mdream/splitter'
|
|
354
|
+
|
|
355
|
+
// Process chunks incrementally - lower memory usage
|
|
356
|
+
for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) {
|
|
357
|
+
await processChunk(chunk) // Handle each chunk as it's generated
|
|
358
|
+
|
|
359
|
+
// Can break early if you found what you need
|
|
360
|
+
if (foundTarget)
|
|
361
|
+
break
|
|
362
|
+
}
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
**Benefits of streaming:**
|
|
366
|
+
- Lower memory usage - chunks aren't stored in an array
|
|
367
|
+
- Early termination - stop processing when you find what you need
|
|
368
|
+
- Better for large documents
|
|
369
|
+
|
|
370
|
+
### Splitting Options
|
|
371
|
+
|
|
372
|
+
```ts
|
|
373
|
+
interface SplitterOptions {
|
|
374
|
+
// Structural splitting
|
|
375
|
+
headersToSplitOn?: number[] // TAG_H1, TAG_H2, etc. Default: [TAG_H2-TAG_H6]
|
|
376
|
+
|
|
377
|
+
// Size-based splitting
|
|
378
|
+
chunkSize?: number // Max chunk size. Default: 1000
|
|
379
|
+
chunkOverlap?: number // Overlap between chunks. Default: 200
|
|
380
|
+
lengthFunction?: (text: string) => number // Custom length (e.g., token count)
|
|
381
|
+
|
|
382
|
+
// Output formatting
|
|
383
|
+
stripHeaders?: boolean // Remove headers from content. Default: true
|
|
384
|
+
returnEachLine?: boolean // Split into individual lines. Default: false
|
|
385
|
+
|
|
386
|
+
// Standard options
|
|
387
|
+
origin?: string // Base URL for links/images
|
|
388
|
+
plugins?: Plugin[] // Apply plugins during conversion
|
|
389
|
+
}
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
### Chunk Metadata
|
|
393
|
+
|
|
394
|
+
Each chunk includes rich metadata for context:
|
|
395
|
+
|
|
396
|
+
```ts
|
|
397
|
+
interface MarkdownChunk {
|
|
398
|
+
content: string
|
|
399
|
+
metadata: {
|
|
400
|
+
headers?: Record<string, string> // Header hierarchy: { h1: "Title", h2: "Section" }
|
|
401
|
+
code?: string // Code block language if present
|
|
402
|
+
loc?: { // Line number range
|
|
403
|
+
lines: { from: number, to: number }
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Use with Presets
|
|
410
|
+
|
|
411
|
+
Combine splitting with presets for optimized output:
|
|
412
|
+
|
|
413
|
+
```ts
|
|
414
|
+
import { TAG_H2 } from 'mdream'
|
|
415
|
+
import { withMinimalPreset } from 'mdream/preset/minimal'
|
|
416
|
+
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
417
|
+
|
|
418
|
+
const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
|
|
419
|
+
headersToSplitOn: [TAG_H2],
|
|
420
|
+
chunkSize: 500,
|
|
421
|
+
origin: 'https://example.com'
|
|
422
|
+
}))
|
|
423
|
+
```
|
|
424
|
+
|
|
314
425
|
## Credits
|
|
315
426
|
|
|
316
427
|
- [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
|
|
@@ -29,10 +29,7 @@ function collectNodeContent(node, content, state) {
|
|
|
29
29
|
*/
|
|
30
30
|
function assembleBufferedContent(state) {
|
|
31
31
|
const fragments = [];
|
|
32
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries()))
|
|
33
|
-
const include = state.regionToggles.get(regionId);
|
|
34
|
-
if (include) fragments.push(...content);
|
|
35
|
-
}
|
|
32
|
+
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
36
33
|
state.regionToggles.clear();
|
|
37
34
|
state.regionContentBuffers.clear();
|
|
38
35
|
return fragments.join("").trimStart();
|
|
@@ -285,4 +282,4 @@ const LIST_ITEM_SPACING = [1, 0];
|
|
|
285
282
|
const TABLE_ROW_SPACING = [0, 1];
|
|
286
283
|
|
|
287
284
|
//#endregion
|
|
288
|
-
export {
|
|
285
|
+
export { TAG_H2 as $, TAG_TBODY as $t, TAG_BUTTON as A, TAG_P as At, TAG_DFN as B, TAG_SCRIPT as Bt, TAG_AUDIO as C, TAG_METER as Ct, TAG_BLOCKQUOTE as D, TAG_OBJECT as Dt, TAG_BDO as E, TAG_NOSCRIPT as Et, TAG_CODE as F, TAG_Q as Ft, TAG_EM as G, TAG_SPAN as Gt, TAG_DIV as H, TAG_SELECT as Ht, TAG_COL as I, TAG_RP as It, TAG_FIGCAPTION as J, TAG_SUB as Jt, TAG_EMBED as K, TAG_STRONG as Kt, TAG_DD as L, TAG_RT as Lt, TAG_CAPTION as M, TAG_PLAINTEXT as Mt, TAG_CENTER as N, TAG_PRE as Nt, TAG_BODY as O, TAG_OL as Ot, TAG_CITE as P, TAG_PROGRESS as Pt, TAG_H1 as Q, TAG_TABLE as Qt, TAG_DEL as R, TAG_RUBY as Rt, TAG_ASIDE as S, TAG_META as St, TAG_BASE as T, TAG_NOFRAMES as Tt, TAG_DL as U, TAG_SMALL as Ut, TAG_DIALOG as V, TAG_SECTION as Vt, TAG_DT as W, TAG_SOURCE as Wt, TAG_FOOTER as X, TAG_SUP as Xt, TAG_FIGURE as Y, TAG_SUMMARY as Yt, TAG_FORM as Z, TAG_SVG as Zt, TAG_A as _, TagIdMap as _n, TAG_LI as _t, LIST_ITEM_SPACING as a, TAG_THEAD as an, TAG_HEADER as at, TAG_AREA as b, createBufferRegion as bn, TAG_MAP as bt, MARKDOWN_HORIZONTAL_RULE as c, TAG_TR as cn, TAG_I as ct, MARKDOWN_STRONG as d, TAG_UL as dn, TAG_INPUT as dt, TAG_TD as en, TAG_H3 as et, MAX_TAG_ID as f, TAG_VAR as fn, TAG_INS as ft, TABLE_ROW_SPACING as g, TEXT_NODE as gn, TAG_LEGEND as gt, NodeEventExit as h, TAG_XMP as hn, TAG_LABEL as ht, HTML_ENTITIES as i, TAG_TH as in, TAG_HEAD as it, TAG_CANVAS as j, TAG_PARAM as jt, TAG_BR as k, TAG_OPTION as kt, MARKDOWN_INLINE_CODE as l, TAG_TRACK as ln, TAG_IFRAME as lt, NodeEventEnter as m, TAG_WBR as mn, TAG_KEYGEN as mt, DEFAULT_BLOCK_SPACING as n, TAG_TEXTAREA as nn, TAG_H5 as nt, MARKDOWN_CODE_BLOCK as o, TAG_TIME as on, TAG_HR as ot, NO_SPACING as p, TAG_VIDEO as pn, TAG_KBD as pt, TAG_FIELDSET as q, TAG_STYLE as qt, ELEMENT_NODE as r, TAG_TFOOT as rn, TAG_H6 as rt, MARKDOWN_EMPHASIS as s, TAG_TITLE as sn, TAG_HTML as st, BLOCKQUOTE_SPACING as t, TAG_TEMPLATE as tn, TAG_H4 as tt, MARKDOWN_STRIKETHROUGH as u, TAG_U as un, TAG_IMG as ut, TAG_ABBR 
as v, assembleBufferedContent as vn, TAG_LINK as vt, TAG_B as w, TAG_NAV as wt, TAG_ARTICLE as x, TAG_MARK as xt, TAG_ADDRESS as y, collectNodeContent as yn, TAG_MAIN as yt, TAG_DETAILS as z, TAG_SAMP as zt };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createPlugin } from "./plugin-DrovQriD.mjs";
|
|
1
|
+
import { t as createPlugin } from "./plugin-CjWWQTuL.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/libs/query-selector.ts
|
|
4
4
|
/**
|
|
@@ -28,8 +28,7 @@ function createClassSelector(selector) {
|
|
|
28
28
|
return {
|
|
29
29
|
matches: (element) => {
|
|
30
30
|
if (!element.attributes?.class) return false;
|
|
31
|
-
|
|
32
|
-
return classes.includes(className);
|
|
31
|
+
return element.attributes.class.trim().split(" ").filter(Boolean).includes(className);
|
|
33
32
|
},
|
|
34
33
|
toString: () => `.${className}`
|
|
35
34
|
};
|
|
@@ -109,7 +108,7 @@ function extractionPlugin(selectors) {
|
|
|
109
108
|
matcher: parseSelector(selector),
|
|
110
109
|
callback
|
|
111
110
|
}));
|
|
112
|
-
const trackedElements = new Map();
|
|
111
|
+
const trackedElements = /* @__PURE__ */ new Map();
|
|
113
112
|
return createPlugin({
|
|
114
113
|
onNodeEnter(element) {
|
|
115
114
|
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
@@ -126,7 +125,6 @@ function extractionPlugin(selectors) {
|
|
|
126
125
|
if (tracked) tracked.textContent += textNode.value;
|
|
127
126
|
currentParent = currentParent.parent;
|
|
128
127
|
}
|
|
129
|
-
return void 0;
|
|
130
128
|
},
|
|
131
129
|
onNodeExit(element, state) {
|
|
132
130
|
const tracked = trackedElements.get(element);
|
|
@@ -143,4 +141,4 @@ function extractionPlugin(selectors) {
|
|
|
143
141
|
}
|
|
144
142
|
|
|
145
143
|
//#endregion
|
|
146
|
-
export {
|
|
144
|
+
export { parseSelector as n, extractionPlugin as t };
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { htmlToMarkdown } from "./src-C3QpB75q.mjs";
|
|
2
|
-
import { extractionPlugin } from "./extraction-BPaDGYvv.mjs";
|
|
3
|
-
import { readFile } from "node:fs/promises";
|
|
4
|
-
import { basename, dirname, relative, sep } from "pathe";
|
|
1
|
+
import { t as htmlToMarkdown } from "./src-BJpipdul.mjs";
|
|
2
|
+
import { t as extractionPlugin } from "./extraction-BA9MDtq3.mjs";
|
|
3
|
+
import { mkdir, open, readFile } from "node:fs/promises";
|
|
4
|
+
import { basename, dirname, join, relative, sep } from "pathe";
|
|
5
5
|
import { glob } from "tinyglobby";
|
|
6
6
|
|
|
7
7
|
//#region src/llms-txt.ts
|
|
@@ -13,28 +13,27 @@ function extractMetadata(html, url) {
|
|
|
13
13
|
let description = "";
|
|
14
14
|
let keywords = "";
|
|
15
15
|
let author = "";
|
|
16
|
-
const extractionPluginInstance = extractionPlugin({
|
|
17
|
-
"title": (element) => {
|
|
18
|
-
if (!title && element.textContent) title = element.textContent.trim();
|
|
19
|
-
},
|
|
20
|
-
"meta[name=\"description\"]": (element) => {
|
|
21
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
22
|
-
},
|
|
23
|
-
"meta[property=\"og:description\"]": (element) => {
|
|
24
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
25
|
-
},
|
|
26
|
-
"meta[name=\"keywords\"]": (element) => {
|
|
27
|
-
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
28
|
-
},
|
|
29
|
-
"meta[name=\"author\"]": (element) => {
|
|
30
|
-
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
31
|
-
},
|
|
32
|
-
"meta[property=\"og:title\"]": (element) => {
|
|
33
|
-
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
34
|
-
}
|
|
35
|
-
});
|
|
36
16
|
htmlToMarkdown(html, {
|
|
37
|
-
plugins: [
|
|
17
|
+
plugins: [extractionPlugin({
|
|
18
|
+
"title": (element) => {
|
|
19
|
+
if (!title && element.textContent) title = element.textContent.trim();
|
|
20
|
+
},
|
|
21
|
+
"meta[name=\"description\"]": (element) => {
|
|
22
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
23
|
+
},
|
|
24
|
+
"meta[property=\"og:description\"]": (element) => {
|
|
25
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
26
|
+
},
|
|
27
|
+
"meta[name=\"keywords\"]": (element) => {
|
|
28
|
+
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
29
|
+
},
|
|
30
|
+
"meta[name=\"author\"]": (element) => {
|
|
31
|
+
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
32
|
+
},
|
|
33
|
+
"meta[property=\"og:title\"]": (element) => {
|
|
34
|
+
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
35
|
+
}
|
|
36
|
+
})],
|
|
38
37
|
origin: url
|
|
39
38
|
});
|
|
40
39
|
return {
|
|
@@ -113,8 +112,7 @@ function generateLlmsTxtContent(files, options) {
|
|
|
113
112
|
* Parse frontmatter from markdown content
|
|
114
113
|
*/
|
|
115
114
|
function parseFrontmatter(content) {
|
|
116
|
-
const frontmatterRegex = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
|
|
117
|
-
const match = content.match(frontmatterRegex);
|
|
115
|
+
const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
|
|
118
116
|
if (!match) return {
|
|
119
117
|
frontmatter: null,
|
|
120
118
|
body: content
|
|
@@ -127,8 +125,7 @@ function parseFrontmatter(content) {
|
|
|
127
125
|
const colonIndex = line.indexOf(":");
|
|
128
126
|
if (colonIndex > 0) {
|
|
129
127
|
const key = line.substring(0, colonIndex).trim();
|
|
130
|
-
|
|
131
|
-
frontmatter[key] = value;
|
|
128
|
+
frontmatter[key] = line.substring(colonIndex + 1).trim();
|
|
132
129
|
}
|
|
133
130
|
}
|
|
134
131
|
return {
|
|
@@ -172,11 +169,10 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
172
169
|
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
173
170
|
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
174
171
|
}
|
|
175
|
-
const mergedFrontmatter = frontmatter ? {
|
|
172
|
+
const frontmatterString = serializeFrontmatter(frontmatter ? {
|
|
176
173
|
...frontmatter,
|
|
177
174
|
...metadata
|
|
178
|
-
} : metadata;
|
|
179
|
-
const frontmatterString = serializeFrontmatter(mergedFrontmatter);
|
|
175
|
+
} : metadata);
|
|
180
176
|
let contentBody = frontmatter ? body : file.content;
|
|
181
177
|
const titleLine = contentBody.trim().split("\n")[0];
|
|
182
178
|
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
@@ -191,8 +187,7 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
191
187
|
function generateMarkdownFilesContent(files) {
|
|
192
188
|
const markdownFiles = [];
|
|
193
189
|
for (const file of files) {
|
|
194
|
-
const urlPath = file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "");
|
|
195
|
-
const mdPath = `md/${urlPath}.md`;
|
|
190
|
+
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "")}.md`;
|
|
196
191
|
markdownFiles.push({
|
|
197
192
|
path: mdPath,
|
|
198
193
|
content: file.content
|
|
@@ -220,6 +215,99 @@ async function generateLlmsTxtArtifacts(options) {
|
|
|
220
215
|
processedFiles: files
|
|
221
216
|
};
|
|
222
217
|
}
|
|
218
|
+
/**
|
|
219
|
+
* Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
|
|
220
|
+
*
|
|
221
|
+
* Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
|
|
222
|
+
* never keeping full content in memory. Creates outputDir recursively if needed.
|
|
223
|
+
*
|
|
224
|
+
* @example
|
|
225
|
+
* ```typescript
|
|
226
|
+
* const stream = createLlmsTxtStream({
|
|
227
|
+
* siteName: 'My Docs',
|
|
228
|
+
* description: 'Documentation site',
|
|
229
|
+
* origin: 'https://example.com',
|
|
230
|
+
* generateFull: true,
|
|
231
|
+
* outputDir: './dist',
|
|
232
|
+
* })
|
|
233
|
+
*
|
|
234
|
+
* const writer = stream.getWriter()
|
|
235
|
+
* await writer.write({
|
|
236
|
+
* title: 'Home',
|
|
237
|
+
* content: '# Welcome\n\nHome page content.',
|
|
238
|
+
* url: '/',
|
|
239
|
+
* })
|
|
240
|
+
* await writer.close()
|
|
241
|
+
* ```
|
|
242
|
+
*
|
|
243
|
+
* @param options - Configuration options
|
|
244
|
+
* @returns WritableStream that accepts ProcessedFile objects
|
|
245
|
+
*/
|
|
246
|
+
function createLlmsTxtStream(options = {}) {
|
|
247
|
+
const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd() } = options;
|
|
248
|
+
let llmsTxtHandle;
|
|
249
|
+
let llmsFullTxtHandle;
|
|
250
|
+
return new WritableStream({
|
|
251
|
+
async start() {
|
|
252
|
+
await mkdir(outputDir, { recursive: true });
|
|
253
|
+
llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
|
|
254
|
+
let header = `# ${siteName}\n\n`;
|
|
255
|
+
if (description) header += `> ${description}\n\n`;
|
|
256
|
+
header += `## Pages\n\n`;
|
|
257
|
+
await llmsTxtHandle.write(header);
|
|
258
|
+
if (generateFull) {
|
|
259
|
+
llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
|
|
260
|
+
let fullHeader = `# ${siteName}\n\n`;
|
|
261
|
+
if (description) fullHeader += `> ${description}\n\n`;
|
|
262
|
+
await llmsFullTxtHandle.write(fullHeader);
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
async write(file) {
|
|
266
|
+
const desc = file.metadata?.description;
|
|
267
|
+
const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
|
|
268
|
+
let chunk = "";
|
|
269
|
+
if (file.filePath && file.filePath.endsWith(".md")) {
|
|
270
|
+
const relativePath = relative(outputDir, file.filePath);
|
|
271
|
+
chunk = `- [${file.title}](${relativePath})${descText}\n`;
|
|
272
|
+
} else {
|
|
273
|
+
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
|
|
274
|
+
chunk = `- [${file.title}](${url})${descText}\n`;
|
|
275
|
+
}
|
|
276
|
+
await llmsTxtHandle?.write(chunk);
|
|
277
|
+
if (generateFull && llmsFullTxtHandle) {
|
|
278
|
+
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
279
|
+
const { frontmatter, body } = parseFrontmatter(file.content);
|
|
280
|
+
const metadata = {
|
|
281
|
+
title: file.title,
|
|
282
|
+
url
|
|
283
|
+
};
|
|
284
|
+
if (file.filePath) metadata.file = relative(outputDir, file.filePath);
|
|
285
|
+
if (file.metadata) {
|
|
286
|
+
if (file.metadata.description) metadata.description = file.metadata.description;
|
|
287
|
+
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
288
|
+
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
289
|
+
}
|
|
290
|
+
const frontmatterString = serializeFrontmatter(frontmatter ? {
|
|
291
|
+
...frontmatter,
|
|
292
|
+
...metadata
|
|
293
|
+
} : metadata);
|
|
294
|
+
let contentBody = frontmatter ? body : file.content;
|
|
295
|
+
const titleLine = contentBody.trim().split("\n")[0];
|
|
296
|
+
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
297
|
+
const fullChunk = `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
|
|
298
|
+
await llmsFullTxtHandle.write(fullChunk);
|
|
299
|
+
}
|
|
300
|
+
},
|
|
301
|
+
async close() {
|
|
302
|
+
await llmsTxtHandle?.close();
|
|
303
|
+
await llmsFullTxtHandle?.close();
|
|
304
|
+
},
|
|
305
|
+
async abort(reason) {
|
|
306
|
+
await llmsTxtHandle?.close();
|
|
307
|
+
await llmsFullTxtHandle?.close();
|
|
308
|
+
}
|
|
309
|
+
});
|
|
310
|
+
}
|
|
223
311
|
|
|
224
312
|
//#endregion
|
|
225
|
-
export { generateLlmsTxtArtifacts };
|
|
313
|
+
export { generateLlmsTxtArtifacts as n, createLlmsTxtStream as t };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { $ as TAG_H2, $t as TAG_TBODY, A as TAG_BUTTON, At as TAG_P, B as TAG_DFN, Bt as TAG_SCRIPT, C as TAG_AUDIO, Ct as TAG_METER, D as TAG_BLOCKQUOTE, E as TAG_BDO, Et as TAG_NOSCRIPT, F as TAG_CODE, Ft as TAG_Q, G as TAG_EM, Gt as TAG_SPAN, H as TAG_DIV, Ht as TAG_SELECT, I as TAG_COL, It as TAG_RP, Jt as TAG_SUB, K as TAG_EMBED, Kt as TAG_STRONG, L as TAG_DD, Lt as TAG_RT, Mt as TAG_PLAINTEXT, N as TAG_CENTER, Nt as TAG_PRE, O as TAG_BODY, Ot as TAG_OL, P as TAG_CITE, Pt as TAG_PROGRESS, Q as TAG_H1, Qt as TAG_TABLE, R as TAG_DEL, Rt as TAG_RUBY, S as TAG_ASIDE, St as TAG_META, T as TAG_BASE, Tt as TAG_NOFRAMES, U as TAG_DL, Ut as TAG_SMALL, V as TAG_DIALOG, W as TAG_DT, Wt as TAG_SOURCE, X as TAG_FOOTER, Xt as TAG_SUP, Yt as TAG_SUMMARY, Z as TAG_FORM, Zt as TAG_SVG, _ as TAG_A, _n as TagIdMap, _t as TAG_LI, a as LIST_ITEM_SPACING, an as TAG_THEAD, b as TAG_AREA, bt as TAG_MAP, c as MARKDOWN_HORIZONTAL_RULE, cn as TAG_TR, ct as TAG_I, d as MARKDOWN_STRONG, dn as TAG_UL, dt as TAG_INPUT, en as TAG_TD, et as TAG_H3, f as MAX_TAG_ID, fn as TAG_VAR, ft as TAG_INS, g as TABLE_ROW_SPACING, gn as TEXT_NODE, gt as TAG_LEGEND, h as NodeEventExit, hn as TAG_XMP, ht as TAG_LABEL, i as HTML_ENTITIES, in as TAG_TH, it as TAG_HEAD, j as TAG_CANVAS, jt as TAG_PARAM, k as TAG_BR, kt as TAG_OPTION, l as MARKDOWN_INLINE_CODE, ln as TAG_TRACK, lt as TAG_IFRAME, m as NodeEventEnter, mn as TAG_WBR, mt as TAG_KEYGEN, n as DEFAULT_BLOCK_SPACING, nn as TAG_TEXTAREA, nt as TAG_H5, o as MARKDOWN_CODE_BLOCK, on as TAG_TIME, ot as TAG_HR, p as NO_SPACING, pn as TAG_VIDEO, pt as TAG_KBD, q as TAG_FIELDSET, qt as TAG_STYLE, r as ELEMENT_NODE, rn as TAG_TFOOT, rt as TAG_H6, s as MARKDOWN_EMPHASIS, sn as TAG_TITLE, t as BLOCKQUOTE_SPACING, tn as TAG_TEMPLATE, tt as TAG_H4, u as MARKDOWN_STRIKETHROUGH, un as TAG_U, ut as TAG_IMG, v as TAG_ABBR, vn as assembleBufferedContent, vt as TAG_LINK, w as TAG_B, wt as TAG_NAV, xt as TAG_MARK, y as TAG_ADDRESS, yn as collectNodeContent, z as 
TAG_DETAILS, zt as TAG_SAMP } from "./const-Bf_XN9U9.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/tags.ts
|
|
4
4
|
function resolveUrl(url, origin) {
|
|
@@ -6,15 +6,9 @@ function resolveUrl(url, origin) {
|
|
|
6
6
|
if (url.startsWith("//")) return `https:${url}`;
|
|
7
7
|
if (url.startsWith("#")) return url;
|
|
8
8
|
if (origin) {
|
|
9
|
-
if (url.startsWith("/") && origin) {
|
|
10
|
-
const cleanOrigin = origin.endsWith("/") ? origin.slice(0, -1) : origin;
|
|
11
|
-
return `${cleanOrigin}${url}`;
|
|
12
|
-
}
|
|
9
|
+
if (url.startsWith("/") && origin) return `${origin.endsWith("/") ? origin.slice(0, -1) : origin}${url}`;
|
|
13
10
|
if (url.startsWith("./")) return `${origin}/${url.slice(2)}`;
|
|
14
|
-
if (!url.startsWith("http")) {
|
|
15
|
-
const cleanUrl = url.startsWith("/") ? url.slice(1) : url;
|
|
16
|
-
return `${origin}/${cleanUrl}`;
|
|
17
|
-
}
|
|
11
|
+
if (!url.startsWith("http")) return `${origin}/${url.startsWith("/") ? url.slice(1) : url}`;
|
|
18
12
|
}
|
|
19
13
|
return url;
|
|
20
14
|
}
|
|
@@ -157,10 +151,7 @@ const tagHandlers = {
|
|
|
157
151
|
},
|
|
158
152
|
[TAG_CODE]: {
|
|
159
153
|
enter: ({ node }) => {
|
|
160
|
-
if ((node.depthMap[TAG_PRE] || 0) > 0) {
|
|
161
|
-
const language = getLanguageFromClass(node.attributes?.class);
|
|
162
|
-
return `${MARKDOWN_CODE_BLOCK}${language}\n`;
|
|
163
|
-
}
|
|
154
|
+
if ((node.depthMap[TAG_PRE] || 0) > 0) return `${MARKDOWN_CODE_BLOCK}${getLanguageFromClass(node.attributes?.class)}\n`;
|
|
164
155
|
return MARKDOWN_INLINE_CODE;
|
|
165
156
|
},
|
|
166
157
|
exit: ({ node }) => {
|
|
@@ -179,9 +170,7 @@ const tagHandlers = {
|
|
|
179
170
|
if (isInsideTableCell(node)) return "<li>";
|
|
180
171
|
const depth = (node.depthMap[TAG_UL] || 0) + (node.depthMap[TAG_OL] || 0) - 1;
|
|
181
172
|
const isOrdered = node.parent?.tagId === TAG_OL;
|
|
182
|
-
|
|
183
|
-
const marker = isOrdered ? `${node.index + 1}. ` : "- ";
|
|
184
|
-
return `${indent}${marker}`;
|
|
173
|
+
return `${" ".repeat(Math.max(0, depth))}${isOrdered ? `${node.index + 1}. ` : "- "}`;
|
|
185
174
|
},
|
|
186
175
|
exit: ({ node }) => isInsideTableCell(node) ? "</li>" : void 0,
|
|
187
176
|
spacing: LIST_ITEM_SPACING
|
|
@@ -194,8 +183,7 @@ const tagHandlers = {
|
|
|
194
183
|
if (!node.attributes?.href) return "";
|
|
195
184
|
const href = resolveUrl(node.attributes?.href || "", state.options?.origin);
|
|
196
185
|
let title = node.attributes?.title;
|
|
197
|
-
|
|
198
|
-
if (lastContent === title) title = "";
|
|
186
|
+
if (state.lastContentCache === title) title = "";
|
|
199
187
|
return title ? `](${href} "${title}")` : `](${href})`;
|
|
200
188
|
},
|
|
201
189
|
collapsesInnerWhiteSpace: true,
|
|
@@ -204,9 +192,7 @@ const tagHandlers = {
|
|
|
204
192
|
},
|
|
205
193
|
[TAG_IMG]: {
|
|
206
194
|
enter: ({ node, state }) => {
|
|
207
|
-
|
|
208
|
-
const src = resolveUrl(node.attributes?.src || "", state.options?.origin);
|
|
209
|
-
return ``;
|
|
195
|
+
return `})`;
|
|
210
196
|
},
|
|
211
197
|
collapsesInnerWhiteSpace: true,
|
|
212
198
|
isSelfClosing: true,
|
|
@@ -241,15 +227,14 @@ const tagHandlers = {
|
|
|
241
227
|
state.tableRenderedTable = true;
|
|
242
228
|
const alignments = state.tableColumnAlignments;
|
|
243
229
|
while (alignments.length < state.tableCurrentRowCells) alignments.push("");
|
|
244
|
-
|
|
230
|
+
return ` |\n| ${alignments.map((align) => {
|
|
245
231
|
switch (align) {
|
|
246
232
|
case "left": return ":---";
|
|
247
233
|
case "center": return ":---:";
|
|
248
234
|
case "right": return "---:";
|
|
249
235
|
default: return "---";
|
|
250
236
|
}
|
|
251
|
-
})
|
|
252
|
-
return ` |\n| ${alignmentMarkers.join(" | ")} |`;
|
|
237
|
+
}).join(" | ")} |`;
|
|
253
238
|
}
|
|
254
239
|
return " |";
|
|
255
240
|
},
|
|
@@ -637,17 +622,15 @@ function isWhitespace(charCode) {
|
|
|
637
622
|
*/
|
|
638
623
|
function parseHtml(html, options = {}) {
|
|
639
624
|
const events = [];
|
|
640
|
-
const state = {
|
|
641
|
-
depthMap: new Uint8Array(MAX_TAG_ID),
|
|
642
|
-
depth: 0,
|
|
643
|
-
plugins: options.plugins || []
|
|
644
|
-
};
|
|
645
|
-
const remainingHtml = parseHtmlInternal(html, state, (event) => {
|
|
646
|
-
events.push(event);
|
|
647
|
-
});
|
|
648
625
|
return {
|
|
649
626
|
events,
|
|
650
|
-
remainingHtml
|
|
627
|
+
remainingHtml: parseHtmlInternal(html, {
|
|
628
|
+
depthMap: new Uint8Array(MAX_TAG_ID),
|
|
629
|
+
depth: 0,
|
|
630
|
+
plugins: options.plugins || []
|
|
631
|
+
}, (event) => {
|
|
632
|
+
events.push(event);
|
|
633
|
+
})
|
|
651
634
|
};
|
|
652
635
|
}
|
|
653
636
|
/**
|
|
@@ -840,8 +823,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
840
823
|
const chunkLength = htmlChunk.length;
|
|
841
824
|
let foundClose = false;
|
|
842
825
|
while (i < chunkLength) {
|
|
843
|
-
|
|
844
|
-
if (charCode === GT_CHAR) {
|
|
826
|
+
if (htmlChunk.charCodeAt(i) === GT_CHAR) {
|
|
845
827
|
foundClose = true;
|
|
846
828
|
break;
|
|
847
829
|
}
|
|
@@ -852,8 +834,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
852
834
|
newPosition: position,
|
|
853
835
|
remainingText: htmlChunk.substring(position)
|
|
854
836
|
};
|
|
855
|
-
const tagName = htmlChunk.substring(tagNameStart, i).toLowerCase();
|
|
856
|
-
const tagId = TagIdMap[tagName] ?? -1;
|
|
837
|
+
const tagId = TagIdMap[htmlChunk.substring(tagNameStart, i).toLowerCase()] ?? -1;
|
|
857
838
|
if (state.currentNode?.tagHandler?.isNonNesting && tagId !== state.currentNode.tagId) return {
|
|
858
839
|
complete: false,
|
|
859
840
|
newPosition: position,
|
|
@@ -885,16 +866,15 @@ function closeNode(node, state, handleEvent) {
|
|
|
885
866
|
const prefix = node.attributes?.title || node.attributes?.["aria-label"] || "";
|
|
886
867
|
if (prefix) {
|
|
887
868
|
node.childTextNodeIndex = 1;
|
|
888
|
-
const textNode = {
|
|
889
|
-
type: TEXT_NODE,
|
|
890
|
-
value: prefix,
|
|
891
|
-
parent: node,
|
|
892
|
-
index: 0,
|
|
893
|
-
depth: node.depth + 1
|
|
894
|
-
};
|
|
895
869
|
handleEvent({
|
|
896
870
|
type: NodeEventEnter,
|
|
897
|
-
node: textNode
|
|
871
|
+
node: {
|
|
872
|
+
type: TEXT_NODE,
|
|
873
|
+
value: prefix,
|
|
874
|
+
parent: node,
|
|
875
|
+
index: 0,
|
|
876
|
+
depth: node.depth + 1
|
|
877
|
+
}
|
|
898
878
|
});
|
|
899
879
|
for (const parent of traverseUpToFirstBlockNode(node)) parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
|
|
900
880
|
}
|
|
@@ -1262,8 +1242,8 @@ function calculateNewLineConfig(node) {
|
|
|
1262
1242
|
function createMarkdownProcessor(options = {}) {
|
|
1263
1243
|
const state = {
|
|
1264
1244
|
options,
|
|
1265
|
-
regionToggles: new Map(),
|
|
1266
|
-
regionContentBuffers: new Map(),
|
|
1245
|
+
regionToggles: /* @__PURE__ */ new Map(),
|
|
1246
|
+
regionContentBuffers: /* @__PURE__ */ new Map(),
|
|
1267
1247
|
depthMap: new Uint8Array(MAX_TAG_ID)
|
|
1268
1248
|
};
|
|
1269
1249
|
state.regionToggles.set(0, true);
|
|
@@ -1315,8 +1295,7 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1315
1295
|
const res = handler[eventFn](context);
|
|
1316
1296
|
if (res) output.push(res);
|
|
1317
1297
|
}
|
|
1318
|
-
const newLineConfig = calculateNewLineConfig(node);
|
|
1319
|
-
const configuredNewLines = newLineConfig[eventType] || 0;
|
|
1298
|
+
const configuredNewLines = calculateNewLineConfig(node)[eventType] || 0;
|
|
1320
1299
|
const newLines = Math.max(0, configuredNewLines - lastNewLines);
|
|
1321
1300
|
if (newLines > 0) {
|
|
1322
1301
|
if (!buff.length) {
|
|
@@ -1332,13 +1311,10 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1332
1311
|
const isInlineElement = node.tagHandler?.isInline;
|
|
1333
1312
|
const collapsesWhiteSpace = node.tagHandler?.collapsesInnerWhiteSpace;
|
|
1334
1313
|
const hasSpacing = node.tagHandler?.spacing && Array.isArray(node.tagHandler.spacing);
|
|
1335
|
-
|
|
1336
|
-
const shouldTrim = (!isInlineElement || eventType === NodeEventExit) && !isBlockElement && !(collapsesWhiteSpace && eventType === NodeEventEnter) && !(hasSpacing && eventType === NodeEventEnter);
|
|
1337
|
-
if (shouldTrim) {
|
|
1314
|
+
if ((!isInlineElement || eventType === NodeEventExit) && !(!isInlineElement && !collapsesWhiteSpace && configuredNewLines > 0) && !(collapsesWhiteSpace && eventType === NodeEventEnter) && !(hasSpacing && eventType === NodeEventEnter)) {
|
|
1338
1315
|
const originalLength = lastFragment.length;
|
|
1339
1316
|
const trimmed = lastFragment.trimEnd();
|
|
1340
|
-
|
|
1341
|
-
if (trimmedChars > 0) {
|
|
1317
|
+
if (originalLength - trimmed.length > 0) {
|
|
1342
1318
|
if (buff?.length && buff[buff.length - 1] === lastFragment) buff[buff.length - 1] = trimmed;
|
|
1343
1319
|
}
|
|
1344
1320
|
}
|
|
@@ -1352,12 +1328,11 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1352
1328
|
* Process HTML string and generate events
|
|
1353
1329
|
*/
|
|
1354
1330
|
function processHtml(html) {
|
|
1355
|
-
|
|
1331
|
+
parseHtmlStream(html, {
|
|
1356
1332
|
depthMap: state.depthMap,
|
|
1357
1333
|
depth: 0,
|
|
1358
1334
|
plugins: state.options?.plugins || []
|
|
1359
|
-
}
|
|
1360
|
-
parseHtmlStream(html, parseState, (event) => {
|
|
1335
|
+
}, (event) => {
|
|
1361
1336
|
processPluginsForEvent(event, state.options?.plugins, state, processEvent);
|
|
1362
1337
|
});
|
|
1363
1338
|
}
|
|
@@ -1365,18 +1340,14 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1365
1340
|
* Get the final markdown output
|
|
1366
1341
|
*/
|
|
1367
1342
|
function getMarkdown() {
|
|
1368
|
-
|
|
1369
|
-
return assembledContent.trimEnd();
|
|
1343
|
+
return assembleBufferedContent(state).trimEnd();
|
|
1370
1344
|
}
|
|
1371
1345
|
/**
|
|
1372
1346
|
* Get new markdown content since the last call (for streaming)
|
|
1373
1347
|
*/
|
|
1374
1348
|
function getMarkdownChunk() {
|
|
1375
1349
|
const fragments = [];
|
|
1376
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries()))
|
|
1377
|
-
const include = state.regionToggles.get(regionId);
|
|
1378
|
-
if (include) fragments.push(...content);
|
|
1379
|
-
}
|
|
1350
|
+
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
1380
1351
|
const currentContent = fragments.join("").trimStart();
|
|
1381
1352
|
const newContent = currentContent.slice(lastYieldedLength);
|
|
1382
1353
|
lastYieldedLength = currentContent.length;
|
|
@@ -1393,4 +1364,4 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1393
1364
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1394
1365
|
|
|
1395
1366
|
//#endregion
|
|
1396
|
-
export {
|
|
1367
|
+
export { parseHtmlStream as a, parseHtml as i, createMarkdownProcessor as n, processPluginsForEvent as r, MarkdownProcessor as t };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-C5_irVJs.mjs";
|
|
1
|
+
import { A as TAG_BUTTON, Dt as TAG_OBJECT, Ht as TAG_SELECT, K as TAG_EMBED, S as TAG_ASIDE, X as TAG_FOOTER, Y as TAG_FIGURE, Z as TAG_FORM, dt as TAG_INPUT, lt as TAG_IFRAME, nn as TAG_TEXTAREA, q as TAG_FIELDSET, wt as TAG_NAV } from "./const-Bf_XN9U9.mjs";
|
|
2
|
+
import { a as filterPlugin, i as frontmatterPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./plugins-DJnqR2fA.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/preset/minimal.ts
|
|
5
5
|
/**
|
|
@@ -37,4 +37,4 @@ function withMinimalPreset(options = {}) {
|
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
//#endregion
|
|
40
|
-
export { withMinimalPreset };
|
|
40
|
+
export { withMinimalPreset as t };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Plugin } from "./types-DqiI86yW.mjs";
|
|
1
|
+
import { d as Plugin } from "./types-CT4ZxeOH.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/pluggable/plugin.d.ts
|
|
4
4
|
|
|
@@ -9,4 +9,4 @@ import { Plugin } from "./types-DqiI86yW.mjs";
|
|
|
9
9
|
*/
|
|
10
10
|
declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
11
11
|
//#endregion
|
|
12
|
-
export { createPlugin as
|
|
12
|
+
export { createPlugin as t };
|