mdream 0.13.3 → 0.14.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/README.md +111 -0
- package/dist/_chunks/{const-BOAJ1T5c.mjs → const-Bf_XN9U9.mjs} +2 -5
- package/dist/_chunks/{extraction-BPaDGYvv.mjs → extraction-BA9MDtq3.mjs} +4 -6
- package/dist/_chunks/{llms-txt-DC12yO2l.mjs → llms-txt-D7Hduhij.mjs} +28 -33
- package/dist/_chunks/{markdown-processor-f7XT0--8.mjs → markdown-processor-D26Uo5td.mjs} +35 -64
- package/dist/_chunks/{minimal-co1tIZYm.mjs → minimal-BiDhcwif.mjs} +3 -3
- package/dist/_chunks/{plugin-DrovQriD.mjs → plugin-CjWWQTuL.mjs} +1 -1
- package/dist/_chunks/{plugin-CgnpSqtP.d.mts → plugin-D5soyEXm.d.mts} +2 -2
- package/dist/_chunks/{plugins-C5_irVJs.mjs → plugins-DJnqR2fA.mjs} +23 -41
- package/dist/_chunks/{src-C3QpB75q.mjs → src-BJpipdul.mjs} +3 -4
- package/dist/_chunks/{types-DqiI86yW.d.mts → types-CT4ZxeOH.d.mts} +1 -1
- package/dist/cli.mjs +12 -18
- package/dist/iife.js +8 -18
- package/dist/index.d.mts +2 -5
- package/dist/index.mjs +4 -4
- package/dist/llms-txt.mjs +6 -6
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +4 -4
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +5 -5
- package/dist/splitter.d.mts +7 -2
- package/dist/splitter.mjs +45 -34
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -311,6 +311,117 @@ htmlToMarkdown(html, { plugins: [plugin] })
|
|
|
311
311
|
|
|
312
312
|
The extraction plugin provides memory-efficient element extraction with full text content and attributes, perfect for SEO analysis, content discovery, and data mining.
|
|
313
313
|
|
|
314
|
+
## Markdown Splitting
|
|
315
|
+
|
|
316
|
+
Split HTML into chunks during conversion for LLM context windows, vector databases, or document processing.
|
|
317
|
+
|
|
318
|
+
### Basic Chunking
|
|
319
|
+
|
|
320
|
+
```ts
|
|
321
|
+
import { TAG_H2 } from 'mdream'
|
|
322
|
+
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
323
|
+
|
|
324
|
+
const html = `
|
|
325
|
+
<h1>Documentation</h1>
|
|
326
|
+
<h2>Installation</h2>
|
|
327
|
+
<p>Install via npm...</p>
|
|
328
|
+
<h2>Usage</h2>
|
|
329
|
+
<p>Use it like this...</p>
|
|
330
|
+
`
|
|
331
|
+
|
|
332
|
+
const chunks = htmlToMarkdownSplitChunks(html, {
|
|
333
|
+
headersToSplitOn: [TAG_H2], // Split on h2 headers
|
|
334
|
+
chunkSize: 1000, // Max chars per chunk
|
|
335
|
+
chunkOverlap: 200, // Overlap for context
|
|
336
|
+
stripHeaders: true // Remove headers from content
|
|
337
|
+
})
|
|
338
|
+
|
|
339
|
+
// Each chunk includes content and metadata
|
|
340
|
+
chunks.forEach((chunk) => {
|
|
341
|
+
console.log(chunk.content)
|
|
342
|
+
console.log(chunk.metadata.headers) // { h1: "Documentation", h2: "Installation" }
|
|
343
|
+
console.log(chunk.metadata.code) // Language if chunk contains code
|
|
344
|
+
console.log(chunk.metadata.loc) // Line numbers
|
|
345
|
+
})
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
### Streaming Chunks (Memory Efficient)
|
|
349
|
+
|
|
350
|
+
For large documents, use the generator version to process chunks one at a time:
|
|
351
|
+
|
|
352
|
+
```ts
|
|
353
|
+
import { htmlToMarkdownSplitChunksStream } from 'mdream/splitter'
|
|
354
|
+
|
|
355
|
+
// Process chunks incrementally - lower memory usage
|
|
356
|
+
for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) {
|
|
357
|
+
await processChunk(chunk) // Handle each chunk as it's generated
|
|
358
|
+
|
|
359
|
+
// Can break early if you found what you need
|
|
360
|
+
if (foundTarget)
|
|
361
|
+
break
|
|
362
|
+
}
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
**Benefits of streaming:**
|
|
366
|
+
- Lower memory usage - chunks aren't stored in an array
|
|
367
|
+
- Early termination - stop processing when you find what you need
|
|
368
|
+
- Better for large documents
|
|
369
|
+
|
|
370
|
+
### Splitting Options
|
|
371
|
+
|
|
372
|
+
```ts
|
|
373
|
+
interface SplitterOptions {
|
|
374
|
+
// Structural splitting
|
|
375
|
+
headersToSplitOn?: number[] // TAG_H1, TAG_H2, etc. Default: TAG_H2 through TAG_H6
|
|
376
|
+
|
|
377
|
+
// Size-based splitting
|
|
378
|
+
chunkSize?: number // Max chunk size. Default: 1000
|
|
379
|
+
chunkOverlap?: number // Overlap between chunks. Default: 200
|
|
380
|
+
lengthFunction?: (text: string) => number // Custom length (e.g., token count)
|
|
381
|
+
|
|
382
|
+
// Output formatting
|
|
383
|
+
stripHeaders?: boolean // Remove headers from content. Default: true
|
|
384
|
+
returnEachLine?: boolean // Split into individual lines. Default: false
|
|
385
|
+
|
|
386
|
+
// Standard options
|
|
387
|
+
origin?: string // Base URL for links/images
|
|
388
|
+
plugins?: Plugin[] // Apply plugins during conversion
|
|
389
|
+
}
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
### Chunk Metadata
|
|
393
|
+
|
|
394
|
+
Each chunk includes rich metadata for context:
|
|
395
|
+
|
|
396
|
+
```ts
|
|
397
|
+
interface MarkdownChunk {
|
|
398
|
+
content: string
|
|
399
|
+
metadata: {
|
|
400
|
+
headers?: Record<string, string> // Header hierarchy: { h1: "Title", h2: "Section" }
|
|
401
|
+
code?: string // Code block language if present
|
|
402
|
+
loc?: { // Line number range
|
|
403
|
+
lines: { from: number, to: number }
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Use with Presets
|
|
410
|
+
|
|
411
|
+
Combine splitting with presets for optimized output:
|
|
412
|
+
|
|
413
|
+
```ts
|
|
414
|
+
import { TAG_H2 } from 'mdream'
|
|
415
|
+
import { withMinimalPreset } from 'mdream/preset/minimal'
|
|
416
|
+
import { htmlToMarkdownSplitChunks } from 'mdream/splitter'
|
|
417
|
+
|
|
418
|
+
const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
|
|
419
|
+
headersToSplitOn: [TAG_H2],
|
|
420
|
+
chunkSize: 500,
|
|
421
|
+
origin: 'https://example.com'
|
|
422
|
+
}))
|
|
423
|
+
```
|
|
424
|
+
|
|
314
425
|
## Credits
|
|
315
426
|
|
|
316
427
|
- [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
|
|
@@ -29,10 +29,7 @@ function collectNodeContent(node, content, state) {
|
|
|
29
29
|
*/
|
|
30
30
|
function assembleBufferedContent(state) {
|
|
31
31
|
const fragments = [];
|
|
32
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries()))
|
|
33
|
-
const include = state.regionToggles.get(regionId);
|
|
34
|
-
if (include) fragments.push(...content);
|
|
35
|
-
}
|
|
32
|
+
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
36
33
|
state.regionToggles.clear();
|
|
37
34
|
state.regionContentBuffers.clear();
|
|
38
35
|
return fragments.join("").trimStart();
|
|
@@ -285,4 +282,4 @@ const LIST_ITEM_SPACING = [1, 0];
|
|
|
285
282
|
const TABLE_ROW_SPACING = [0, 1];
|
|
286
283
|
|
|
287
284
|
//#endregion
|
|
288
|
-
export {
|
|
285
|
+
export { TAG_H2 as $, TAG_TBODY as $t, TAG_BUTTON as A, TAG_P as At, TAG_DFN as B, TAG_SCRIPT as Bt, TAG_AUDIO as C, TAG_METER as Ct, TAG_BLOCKQUOTE as D, TAG_OBJECT as Dt, TAG_BDO as E, TAG_NOSCRIPT as Et, TAG_CODE as F, TAG_Q as Ft, TAG_EM as G, TAG_SPAN as Gt, TAG_DIV as H, TAG_SELECT as Ht, TAG_COL as I, TAG_RP as It, TAG_FIGCAPTION as J, TAG_SUB as Jt, TAG_EMBED as K, TAG_STRONG as Kt, TAG_DD as L, TAG_RT as Lt, TAG_CAPTION as M, TAG_PLAINTEXT as Mt, TAG_CENTER as N, TAG_PRE as Nt, TAG_BODY as O, TAG_OL as Ot, TAG_CITE as P, TAG_PROGRESS as Pt, TAG_H1 as Q, TAG_TABLE as Qt, TAG_DEL as R, TAG_RUBY as Rt, TAG_ASIDE as S, TAG_META as St, TAG_BASE as T, TAG_NOFRAMES as Tt, TAG_DL as U, TAG_SMALL as Ut, TAG_DIALOG as V, TAG_SECTION as Vt, TAG_DT as W, TAG_SOURCE as Wt, TAG_FOOTER as X, TAG_SUP as Xt, TAG_FIGURE as Y, TAG_SUMMARY as Yt, TAG_FORM as Z, TAG_SVG as Zt, TAG_A as _, TagIdMap as _n, TAG_LI as _t, LIST_ITEM_SPACING as a, TAG_THEAD as an, TAG_HEADER as at, TAG_AREA as b, createBufferRegion as bn, TAG_MAP as bt, MARKDOWN_HORIZONTAL_RULE as c, TAG_TR as cn, TAG_I as ct, MARKDOWN_STRONG as d, TAG_UL as dn, TAG_INPUT as dt, TAG_TD as en, TAG_H3 as et, MAX_TAG_ID as f, TAG_VAR as fn, TAG_INS as ft, TABLE_ROW_SPACING as g, TEXT_NODE as gn, TAG_LEGEND as gt, NodeEventExit as h, TAG_XMP as hn, TAG_LABEL as ht, HTML_ENTITIES as i, TAG_TH as in, TAG_HEAD as it, TAG_CANVAS as j, TAG_PARAM as jt, TAG_BR as k, TAG_OPTION as kt, MARKDOWN_INLINE_CODE as l, TAG_TRACK as ln, TAG_IFRAME as lt, NodeEventEnter as m, TAG_WBR as mn, TAG_KEYGEN as mt, DEFAULT_BLOCK_SPACING as n, TAG_TEXTAREA as nn, TAG_H5 as nt, MARKDOWN_CODE_BLOCK as o, TAG_TIME as on, TAG_HR as ot, NO_SPACING as p, TAG_VIDEO as pn, TAG_KBD as pt, TAG_FIELDSET as q, TAG_STYLE as qt, ELEMENT_NODE as r, TAG_TFOOT as rn, TAG_H6 as rt, MARKDOWN_EMPHASIS as s, TAG_TITLE as sn, TAG_HTML as st, BLOCKQUOTE_SPACING as t, TAG_TEMPLATE as tn, TAG_H4 as tt, MARKDOWN_STRIKETHROUGH as u, TAG_U as un, TAG_IMG as ut, TAG_ABBR 
as v, assembleBufferedContent as vn, TAG_LINK as vt, TAG_B as w, TAG_NAV as wt, TAG_ARTICLE as x, TAG_MARK as xt, TAG_ADDRESS as y, collectNodeContent as yn, TAG_MAIN as yt, TAG_DETAILS as z, TAG_SAMP as zt };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createPlugin } from "./plugin-
|
|
1
|
+
import { t as createPlugin } from "./plugin-CjWWQTuL.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/libs/query-selector.ts
|
|
4
4
|
/**
|
|
@@ -28,8 +28,7 @@ function createClassSelector(selector) {
|
|
|
28
28
|
return {
|
|
29
29
|
matches: (element) => {
|
|
30
30
|
if (!element.attributes?.class) return false;
|
|
31
|
-
|
|
32
|
-
return classes.includes(className);
|
|
31
|
+
return element.attributes.class.trim().split(" ").filter(Boolean).includes(className);
|
|
33
32
|
},
|
|
34
33
|
toString: () => `.${className}`
|
|
35
34
|
};
|
|
@@ -109,7 +108,7 @@ function extractionPlugin(selectors) {
|
|
|
109
108
|
matcher: parseSelector(selector),
|
|
110
109
|
callback
|
|
111
110
|
}));
|
|
112
|
-
const trackedElements = new Map();
|
|
111
|
+
const trackedElements = /* @__PURE__ */ new Map();
|
|
113
112
|
return createPlugin({
|
|
114
113
|
onNodeEnter(element) {
|
|
115
114
|
matcherCallbacks.forEach(({ matcher, callback }) => {
|
|
@@ -126,7 +125,6 @@ function extractionPlugin(selectors) {
|
|
|
126
125
|
if (tracked) tracked.textContent += textNode.value;
|
|
127
126
|
currentParent = currentParent.parent;
|
|
128
127
|
}
|
|
129
|
-
return void 0;
|
|
130
128
|
},
|
|
131
129
|
onNodeExit(element, state) {
|
|
132
130
|
const tracked = trackedElements.get(element);
|
|
@@ -143,4 +141,4 @@ function extractionPlugin(selectors) {
|
|
|
143
141
|
}
|
|
144
142
|
|
|
145
143
|
//#endregion
|
|
146
|
-
export {
|
|
144
|
+
export { parseSelector as n, extractionPlugin as t };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { htmlToMarkdown } from "./src-
|
|
2
|
-
import { extractionPlugin } from "./extraction-
|
|
1
|
+
import { t as htmlToMarkdown } from "./src-BJpipdul.mjs";
|
|
2
|
+
import { t as extractionPlugin } from "./extraction-BA9MDtq3.mjs";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, relative, sep } from "pathe";
|
|
5
5
|
import { glob } from "tinyglobby";
|
|
@@ -13,28 +13,27 @@ function extractMetadata(html, url) {
|
|
|
13
13
|
let description = "";
|
|
14
14
|
let keywords = "";
|
|
15
15
|
let author = "";
|
|
16
|
-
const extractionPluginInstance = extractionPlugin({
|
|
17
|
-
"title": (element) => {
|
|
18
|
-
if (!title && element.textContent) title = element.textContent.trim();
|
|
19
|
-
},
|
|
20
|
-
"meta[name=\"description\"]": (element) => {
|
|
21
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
22
|
-
},
|
|
23
|
-
"meta[property=\"og:description\"]": (element) => {
|
|
24
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
25
|
-
},
|
|
26
|
-
"meta[name=\"keywords\"]": (element) => {
|
|
27
|
-
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
28
|
-
},
|
|
29
|
-
"meta[name=\"author\"]": (element) => {
|
|
30
|
-
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
31
|
-
},
|
|
32
|
-
"meta[property=\"og:title\"]": (element) => {
|
|
33
|
-
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
34
|
-
}
|
|
35
|
-
});
|
|
36
16
|
htmlToMarkdown(html, {
|
|
37
|
-
plugins: [
|
|
17
|
+
plugins: [extractionPlugin({
|
|
18
|
+
"title": (element) => {
|
|
19
|
+
if (!title && element.textContent) title = element.textContent.trim();
|
|
20
|
+
},
|
|
21
|
+
"meta[name=\"description\"]": (element) => {
|
|
22
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
23
|
+
},
|
|
24
|
+
"meta[property=\"og:description\"]": (element) => {
|
|
25
|
+
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
26
|
+
},
|
|
27
|
+
"meta[name=\"keywords\"]": (element) => {
|
|
28
|
+
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
29
|
+
},
|
|
30
|
+
"meta[name=\"author\"]": (element) => {
|
|
31
|
+
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
32
|
+
},
|
|
33
|
+
"meta[property=\"og:title\"]": (element) => {
|
|
34
|
+
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
35
|
+
}
|
|
36
|
+
})],
|
|
38
37
|
origin: url
|
|
39
38
|
});
|
|
40
39
|
return {
|
|
@@ -113,8 +112,7 @@ function generateLlmsTxtContent(files, options) {
|
|
|
113
112
|
* Parse frontmatter from markdown content
|
|
114
113
|
*/
|
|
115
114
|
function parseFrontmatter(content) {
|
|
116
|
-
const
|
|
117
|
-
const match = content.match(frontmatterRegex);
|
|
115
|
+
const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
|
|
118
116
|
if (!match) return {
|
|
119
117
|
frontmatter: null,
|
|
120
118
|
body: content
|
|
@@ -127,8 +125,7 @@ function parseFrontmatter(content) {
|
|
|
127
125
|
const colonIndex = line.indexOf(":");
|
|
128
126
|
if (colonIndex > 0) {
|
|
129
127
|
const key = line.substring(0, colonIndex).trim();
|
|
130
|
-
|
|
131
|
-
frontmatter[key] = value;
|
|
128
|
+
frontmatter[key] = line.substring(colonIndex + 1).trim();
|
|
132
129
|
}
|
|
133
130
|
}
|
|
134
131
|
return {
|
|
@@ -172,11 +169,10 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
172
169
|
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
173
170
|
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
174
171
|
}
|
|
175
|
-
const
|
|
172
|
+
const frontmatterString = serializeFrontmatter(frontmatter ? {
|
|
176
173
|
...frontmatter,
|
|
177
174
|
...metadata
|
|
178
|
-
} : metadata;
|
|
179
|
-
const frontmatterString = serializeFrontmatter(mergedFrontmatter);
|
|
175
|
+
} : metadata);
|
|
180
176
|
let contentBody = frontmatter ? body : file.content;
|
|
181
177
|
const titleLine = contentBody.trim().split("\n")[0];
|
|
182
178
|
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
@@ -191,8 +187,7 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
191
187
|
function generateMarkdownFilesContent(files) {
|
|
192
188
|
const markdownFiles = [];
|
|
193
189
|
for (const file of files) {
|
|
194
|
-
const
|
|
195
|
-
const mdPath = `md/${urlPath}.md`;
|
|
190
|
+
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "")}.md`;
|
|
196
191
|
markdownFiles.push({
|
|
197
192
|
path: mdPath,
|
|
198
193
|
content: file.content
|
|
@@ -222,4 +217,4 @@ async function generateLlmsTxtArtifacts(options) {
|
|
|
222
217
|
}
|
|
223
218
|
|
|
224
219
|
//#endregion
|
|
225
|
-
export { generateLlmsTxtArtifacts };
|
|
220
|
+
export { generateLlmsTxtArtifacts as t };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { $ as TAG_H2, $t as TAG_TBODY, A as TAG_BUTTON, At as TAG_P, B as TAG_DFN, Bt as TAG_SCRIPT, C as TAG_AUDIO, Ct as TAG_METER, D as TAG_BLOCKQUOTE, E as TAG_BDO, Et as TAG_NOSCRIPT, F as TAG_CODE, Ft as TAG_Q, G as TAG_EM, Gt as TAG_SPAN, H as TAG_DIV, Ht as TAG_SELECT, I as TAG_COL, It as TAG_RP, Jt as TAG_SUB, K as TAG_EMBED, Kt as TAG_STRONG, L as TAG_DD, Lt as TAG_RT, Mt as TAG_PLAINTEXT, N as TAG_CENTER, Nt as TAG_PRE, O as TAG_BODY, Ot as TAG_OL, P as TAG_CITE, Pt as TAG_PROGRESS, Q as TAG_H1, Qt as TAG_TABLE, R as TAG_DEL, Rt as TAG_RUBY, S as TAG_ASIDE, St as TAG_META, T as TAG_BASE, Tt as TAG_NOFRAMES, U as TAG_DL, Ut as TAG_SMALL, V as TAG_DIALOG, W as TAG_DT, Wt as TAG_SOURCE, X as TAG_FOOTER, Xt as TAG_SUP, Yt as TAG_SUMMARY, Z as TAG_FORM, Zt as TAG_SVG, _ as TAG_A, _n as TagIdMap, _t as TAG_LI, a as LIST_ITEM_SPACING, an as TAG_THEAD, b as TAG_AREA, bt as TAG_MAP, c as MARKDOWN_HORIZONTAL_RULE, cn as TAG_TR, ct as TAG_I, d as MARKDOWN_STRONG, dn as TAG_UL, dt as TAG_INPUT, en as TAG_TD, et as TAG_H3, f as MAX_TAG_ID, fn as TAG_VAR, ft as TAG_INS, g as TABLE_ROW_SPACING, gn as TEXT_NODE, gt as TAG_LEGEND, h as NodeEventExit, hn as TAG_XMP, ht as TAG_LABEL, i as HTML_ENTITIES, in as TAG_TH, it as TAG_HEAD, j as TAG_CANVAS, jt as TAG_PARAM, k as TAG_BR, kt as TAG_OPTION, l as MARKDOWN_INLINE_CODE, ln as TAG_TRACK, lt as TAG_IFRAME, m as NodeEventEnter, mn as TAG_WBR, mt as TAG_KEYGEN, n as DEFAULT_BLOCK_SPACING, nn as TAG_TEXTAREA, nt as TAG_H5, o as MARKDOWN_CODE_BLOCK, on as TAG_TIME, ot as TAG_HR, p as NO_SPACING, pn as TAG_VIDEO, pt as TAG_KBD, q as TAG_FIELDSET, qt as TAG_STYLE, r as ELEMENT_NODE, rn as TAG_TFOOT, rt as TAG_H6, s as MARKDOWN_EMPHASIS, sn as TAG_TITLE, t as BLOCKQUOTE_SPACING, tn as TAG_TEMPLATE, tt as TAG_H4, u as MARKDOWN_STRIKETHROUGH, un as TAG_U, ut as TAG_IMG, v as TAG_ABBR, vn as assembleBufferedContent, vt as TAG_LINK, w as TAG_B, wt as TAG_NAV, xt as TAG_MARK, y as TAG_ADDRESS, yn as collectNodeContent, z as 
TAG_DETAILS, zt as TAG_SAMP } from "./const-Bf_XN9U9.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/tags.ts
|
|
4
4
|
function resolveUrl(url, origin) {
|
|
@@ -6,15 +6,9 @@ function resolveUrl(url, origin) {
|
|
|
6
6
|
if (url.startsWith("//")) return `https:${url}`;
|
|
7
7
|
if (url.startsWith("#")) return url;
|
|
8
8
|
if (origin) {
|
|
9
|
-
if (url.startsWith("/") && origin) {
|
|
10
|
-
const cleanOrigin = origin.endsWith("/") ? origin.slice(0, -1) : origin;
|
|
11
|
-
return `${cleanOrigin}${url}`;
|
|
12
|
-
}
|
|
9
|
+
if (url.startsWith("/") && origin) return `${origin.endsWith("/") ? origin.slice(0, -1) : origin}${url}`;
|
|
13
10
|
if (url.startsWith("./")) return `${origin}/${url.slice(2)}`;
|
|
14
|
-
if (!url.startsWith("http")) {
|
|
15
|
-
const cleanUrl = url.startsWith("/") ? url.slice(1) : url;
|
|
16
|
-
return `${origin}/${cleanUrl}`;
|
|
17
|
-
}
|
|
11
|
+
if (!url.startsWith("http")) return `${origin}/${url.startsWith("/") ? url.slice(1) : url}`;
|
|
18
12
|
}
|
|
19
13
|
return url;
|
|
20
14
|
}
|
|
@@ -157,10 +151,7 @@ const tagHandlers = {
|
|
|
157
151
|
},
|
|
158
152
|
[TAG_CODE]: {
|
|
159
153
|
enter: ({ node }) => {
|
|
160
|
-
if ((node.depthMap[TAG_PRE] || 0) > 0) {
|
|
161
|
-
const language = getLanguageFromClass(node.attributes?.class);
|
|
162
|
-
return `${MARKDOWN_CODE_BLOCK}${language}\n`;
|
|
163
|
-
}
|
|
154
|
+
if ((node.depthMap[TAG_PRE] || 0) > 0) return `${MARKDOWN_CODE_BLOCK}${getLanguageFromClass(node.attributes?.class)}\n`;
|
|
164
155
|
return MARKDOWN_INLINE_CODE;
|
|
165
156
|
},
|
|
166
157
|
exit: ({ node }) => {
|
|
@@ -179,9 +170,7 @@ const tagHandlers = {
|
|
|
179
170
|
if (isInsideTableCell(node)) return "<li>";
|
|
180
171
|
const depth = (node.depthMap[TAG_UL] || 0) + (node.depthMap[TAG_OL] || 0) - 1;
|
|
181
172
|
const isOrdered = node.parent?.tagId === TAG_OL;
|
|
182
|
-
|
|
183
|
-
const marker = isOrdered ? `${node.index + 1}. ` : "- ";
|
|
184
|
-
return `${indent}${marker}`;
|
|
173
|
+
return `${" ".repeat(Math.max(0, depth))}${isOrdered ? `${node.index + 1}. ` : "- "}`;
|
|
185
174
|
},
|
|
186
175
|
exit: ({ node }) => isInsideTableCell(node) ? "</li>" : void 0,
|
|
187
176
|
spacing: LIST_ITEM_SPACING
|
|
@@ -194,8 +183,7 @@ const tagHandlers = {
|
|
|
194
183
|
if (!node.attributes?.href) return "";
|
|
195
184
|
const href = resolveUrl(node.attributes?.href || "", state.options?.origin);
|
|
196
185
|
let title = node.attributes?.title;
|
|
197
|
-
|
|
198
|
-
if (lastContent === title) title = "";
|
|
186
|
+
if (state.lastContentCache === title) title = "";
|
|
199
187
|
return title ? `](${href} "${title}")` : `](${href})`;
|
|
200
188
|
},
|
|
201
189
|
collapsesInnerWhiteSpace: true,
|
|
@@ -204,9 +192,7 @@ const tagHandlers = {
|
|
|
204
192
|
},
|
|
205
193
|
[TAG_IMG]: {
|
|
206
194
|
enter: ({ node, state }) => {
|
|
207
|
-
|
|
208
|
-
const src = resolveUrl(node.attributes?.src || "", state.options?.origin);
|
|
209
|
-
return ``;
|
|
195
|
+
return `})`;
|
|
210
196
|
},
|
|
211
197
|
collapsesInnerWhiteSpace: true,
|
|
212
198
|
isSelfClosing: true,
|
|
@@ -241,15 +227,14 @@ const tagHandlers = {
|
|
|
241
227
|
state.tableRenderedTable = true;
|
|
242
228
|
const alignments = state.tableColumnAlignments;
|
|
243
229
|
while (alignments.length < state.tableCurrentRowCells) alignments.push("");
|
|
244
|
-
|
|
230
|
+
return ` |\n| ${alignments.map((align) => {
|
|
245
231
|
switch (align) {
|
|
246
232
|
case "left": return ":---";
|
|
247
233
|
case "center": return ":---:";
|
|
248
234
|
case "right": return "---:";
|
|
249
235
|
default: return "---";
|
|
250
236
|
}
|
|
251
|
-
})
|
|
252
|
-
return ` |\n| ${alignmentMarkers.join(" | ")} |`;
|
|
237
|
+
}).join(" | ")} |`;
|
|
253
238
|
}
|
|
254
239
|
return " |";
|
|
255
240
|
},
|
|
@@ -637,17 +622,15 @@ function isWhitespace(charCode) {
|
|
|
637
622
|
*/
|
|
638
623
|
function parseHtml(html, options = {}) {
|
|
639
624
|
const events = [];
|
|
640
|
-
const state = {
|
|
641
|
-
depthMap: new Uint8Array(MAX_TAG_ID),
|
|
642
|
-
depth: 0,
|
|
643
|
-
plugins: options.plugins || []
|
|
644
|
-
};
|
|
645
|
-
const remainingHtml = parseHtmlInternal(html, state, (event) => {
|
|
646
|
-
events.push(event);
|
|
647
|
-
});
|
|
648
625
|
return {
|
|
649
626
|
events,
|
|
650
|
-
remainingHtml
|
|
627
|
+
remainingHtml: parseHtmlInternal(html, {
|
|
628
|
+
depthMap: new Uint8Array(MAX_TAG_ID),
|
|
629
|
+
depth: 0,
|
|
630
|
+
plugins: options.plugins || []
|
|
631
|
+
}, (event) => {
|
|
632
|
+
events.push(event);
|
|
633
|
+
})
|
|
651
634
|
};
|
|
652
635
|
}
|
|
653
636
|
/**
|
|
@@ -840,8 +823,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
840
823
|
const chunkLength = htmlChunk.length;
|
|
841
824
|
let foundClose = false;
|
|
842
825
|
while (i < chunkLength) {
|
|
843
|
-
|
|
844
|
-
if (charCode === GT_CHAR) {
|
|
826
|
+
if (htmlChunk.charCodeAt(i) === GT_CHAR) {
|
|
845
827
|
foundClose = true;
|
|
846
828
|
break;
|
|
847
829
|
}
|
|
@@ -852,8 +834,7 @@ function processClosingTag(htmlChunk, position, state, handleEvent) {
|
|
|
852
834
|
newPosition: position,
|
|
853
835
|
remainingText: htmlChunk.substring(position)
|
|
854
836
|
};
|
|
855
|
-
const
|
|
856
|
-
const tagId = TagIdMap[tagName] ?? -1;
|
|
837
|
+
const tagId = TagIdMap[htmlChunk.substring(tagNameStart, i).toLowerCase()] ?? -1;
|
|
857
838
|
if (state.currentNode?.tagHandler?.isNonNesting && tagId !== state.currentNode.tagId) return {
|
|
858
839
|
complete: false,
|
|
859
840
|
newPosition: position,
|
|
@@ -885,16 +866,15 @@ function closeNode(node, state, handleEvent) {
|
|
|
885
866
|
const prefix = node.attributes?.title || node.attributes?.["aria-label"] || "";
|
|
886
867
|
if (prefix) {
|
|
887
868
|
node.childTextNodeIndex = 1;
|
|
888
|
-
const textNode = {
|
|
889
|
-
type: TEXT_NODE,
|
|
890
|
-
value: prefix,
|
|
891
|
-
parent: node,
|
|
892
|
-
index: 0,
|
|
893
|
-
depth: node.depth + 1
|
|
894
|
-
};
|
|
895
869
|
handleEvent({
|
|
896
870
|
type: NodeEventEnter,
|
|
897
|
-
node:
|
|
871
|
+
node: {
|
|
872
|
+
type: TEXT_NODE,
|
|
873
|
+
value: prefix,
|
|
874
|
+
parent: node,
|
|
875
|
+
index: 0,
|
|
876
|
+
depth: node.depth + 1
|
|
877
|
+
}
|
|
898
878
|
});
|
|
899
879
|
for (const parent of traverseUpToFirstBlockNode(node)) parent.childTextNodeIndex = (parent.childTextNodeIndex || 0) + 1;
|
|
900
880
|
}
|
|
@@ -1262,8 +1242,8 @@ function calculateNewLineConfig(node) {
|
|
|
1262
1242
|
function createMarkdownProcessor(options = {}) {
|
|
1263
1243
|
const state = {
|
|
1264
1244
|
options,
|
|
1265
|
-
regionToggles: new Map(),
|
|
1266
|
-
regionContentBuffers: new Map(),
|
|
1245
|
+
regionToggles: /* @__PURE__ */ new Map(),
|
|
1246
|
+
regionContentBuffers: /* @__PURE__ */ new Map(),
|
|
1267
1247
|
depthMap: new Uint8Array(MAX_TAG_ID)
|
|
1268
1248
|
};
|
|
1269
1249
|
state.regionToggles.set(0, true);
|
|
@@ -1315,8 +1295,7 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1315
1295
|
const res = handler[eventFn](context);
|
|
1316
1296
|
if (res) output.push(res);
|
|
1317
1297
|
}
|
|
1318
|
-
const
|
|
1319
|
-
const configuredNewLines = newLineConfig[eventType] || 0;
|
|
1298
|
+
const configuredNewLines = calculateNewLineConfig(node)[eventType] || 0;
|
|
1320
1299
|
const newLines = Math.max(0, configuredNewLines - lastNewLines);
|
|
1321
1300
|
if (newLines > 0) {
|
|
1322
1301
|
if (!buff.length) {
|
|
@@ -1332,13 +1311,10 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1332
1311
|
const isInlineElement = node.tagHandler?.isInline;
|
|
1333
1312
|
const collapsesWhiteSpace = node.tagHandler?.collapsesInnerWhiteSpace;
|
|
1334
1313
|
const hasSpacing = node.tagHandler?.spacing && Array.isArray(node.tagHandler.spacing);
|
|
1335
|
-
|
|
1336
|
-
const shouldTrim = (!isInlineElement || eventType === NodeEventExit) && !isBlockElement && !(collapsesWhiteSpace && eventType === NodeEventEnter) && !(hasSpacing && eventType === NodeEventEnter);
|
|
1337
|
-
if (shouldTrim) {
|
|
1314
|
+
if ((!isInlineElement || eventType === NodeEventExit) && !(!isInlineElement && !collapsesWhiteSpace && configuredNewLines > 0) && !(collapsesWhiteSpace && eventType === NodeEventEnter) && !(hasSpacing && eventType === NodeEventEnter)) {
|
|
1338
1315
|
const originalLength = lastFragment.length;
|
|
1339
1316
|
const trimmed = lastFragment.trimEnd();
|
|
1340
|
-
|
|
1341
|
-
if (trimmedChars > 0) {
|
|
1317
|
+
if (originalLength - trimmed.length > 0) {
|
|
1342
1318
|
if (buff?.length && buff[buff.length - 1] === lastFragment) buff[buff.length - 1] = trimmed;
|
|
1343
1319
|
}
|
|
1344
1320
|
}
|
|
@@ -1352,12 +1328,11 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1352
1328
|
* Process HTML string and generate events
|
|
1353
1329
|
*/
|
|
1354
1330
|
function processHtml(html) {
|
|
1355
|
-
|
|
1331
|
+
parseHtmlStream(html, {
|
|
1356
1332
|
depthMap: state.depthMap,
|
|
1357
1333
|
depth: 0,
|
|
1358
1334
|
plugins: state.options?.plugins || []
|
|
1359
|
-
}
|
|
1360
|
-
parseHtmlStream(html, parseState, (event) => {
|
|
1335
|
+
}, (event) => {
|
|
1361
1336
|
processPluginsForEvent(event, state.options?.plugins, state, processEvent);
|
|
1362
1337
|
});
|
|
1363
1338
|
}
|
|
@@ -1365,18 +1340,14 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1365
1340
|
* Get the final markdown output
|
|
1366
1341
|
*/
|
|
1367
1342
|
function getMarkdown() {
|
|
1368
|
-
|
|
1369
|
-
return assembledContent.trimEnd();
|
|
1343
|
+
return assembleBufferedContent(state).trimEnd();
|
|
1370
1344
|
}
|
|
1371
1345
|
/**
|
|
1372
1346
|
* Get new markdown content since the last call (for streaming)
|
|
1373
1347
|
*/
|
|
1374
1348
|
function getMarkdownChunk() {
|
|
1375
1349
|
const fragments = [];
|
|
1376
|
-
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries()))
|
|
1377
|
-
const include = state.regionToggles.get(regionId);
|
|
1378
|
-
if (include) fragments.push(...content);
|
|
1379
|
-
}
|
|
1350
|
+
for (const [regionId, content] of Array.from(state.regionContentBuffers.entries())) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
1380
1351
|
const currentContent = fragments.join("").trimStart();
|
|
1381
1352
|
const newContent = currentContent.slice(lastYieldedLength);
|
|
1382
1353
|
lastYieldedLength = currentContent.length;
|
|
@@ -1393,4 +1364,4 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1393
1364
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1394
1365
|
|
|
1395
1366
|
//#endregion
|
|
1396
|
-
export {
|
|
1367
|
+
export { parseHtmlStream as a, parseHtml as i, createMarkdownProcessor as n, processPluginsForEvent as r, MarkdownProcessor as t };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-
|
|
1
|
+
import { A as TAG_BUTTON, Dt as TAG_OBJECT, Ht as TAG_SELECT, K as TAG_EMBED, S as TAG_ASIDE, X as TAG_FOOTER, Y as TAG_FIGURE, Z as TAG_FORM, dt as TAG_INPUT, lt as TAG_IFRAME, nn as TAG_TEXTAREA, q as TAG_FIELDSET, wt as TAG_NAV } from "./const-Bf_XN9U9.mjs";
|
|
2
|
+
import { a as filterPlugin, i as frontmatterPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./plugins-DJnqR2fA.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/preset/minimal.ts
|
|
5
5
|
/**
|
|
@@ -37,4 +37,4 @@ function withMinimalPreset(options = {}) {
|
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
//#endregion
|
|
40
|
-
export { withMinimalPreset };
|
|
40
|
+
export { withMinimalPreset as t };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Plugin } from "./types-
|
|
1
|
+
import { d as Plugin } from "./types-CT4ZxeOH.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/pluggable/plugin.d.ts
|
|
4
4
|
|
|
@@ -9,4 +9,4 @@ import { Plugin } from "./types-DqiI86yW.mjs";
|
|
|
9
9
|
*/
|
|
10
10
|
declare function createPlugin<T extends Partial<Plugin>>(plugin: T): Plugin;
|
|
11
11
|
//#endregion
|
|
12
|
-
export { createPlugin as
|
|
12
|
+
export { createPlugin as t };
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { createPlugin } from "./plugin-
|
|
3
|
-
import { parseSelector } from "./extraction-
|
|
1
|
+
import { $ as TAG_H2, $t as TAG_TBODY, A as TAG_BUTTON, At as TAG_P, Bt as TAG_SCRIPT, C as TAG_AUDIO, D as TAG_BLOCKQUOTE, Dt as TAG_OBJECT, F as TAG_CODE, G as TAG_EM, Gt as TAG_SPAN, H as TAG_DIV, Ht as TAG_SELECT, J as TAG_FIGCAPTION, K as TAG_EMBED, Kt as TAG_STRONG, L as TAG_DD, M as TAG_CAPTION, Nt as TAG_PRE, O as TAG_BODY, Ot as TAG_OL, Q as TAG_H1, Qt as TAG_TABLE, S as TAG_ASIDE, St as TAG_META, U as TAG_DL, Vt as TAG_SECTION, W as TAG_DT, X as TAG_FOOTER, Y as TAG_FIGURE, Yt as TAG_SUMMARY, Z as TAG_FORM, Zt as TAG_SVG, _ as TAG_A, _t as TAG_LI, an as TAG_THEAD, at as TAG_HEADER, bn as createBufferRegion, cn as TAG_TR, ct as TAG_I, dn as TAG_UL, dt as TAG_INPUT, en as TAG_TD, et as TAG_H3, gn as TEXT_NODE, in as TAG_TH, it as TAG_HEAD, k as TAG_BR, lt as TAG_IFRAME, nn as TAG_TEXTAREA, nt as TAG_H5, ot as TAG_HR, pn as TAG_VIDEO, q as TAG_FIELDSET, qt as TAG_STYLE, r as ELEMENT_NODE, rn as TAG_TFOOT, rt as TAG_H6, sn as TAG_TITLE, st as TAG_HTML, tt as TAG_H4, ut as TAG_IMG, w as TAG_B, wt as TAG_NAV, x as TAG_ARTICLE, y as TAG_ADDRESS, yn as collectNodeContent, yt as TAG_MAIN, z as TAG_DETAILS } from "./const-Bf_XN9U9.mjs";
|
|
2
|
+
import { t as createPlugin } from "./plugin-CjWWQTuL.mjs";
|
|
3
|
+
import { n as parseSelector } from "./extraction-BA9MDtq3.mjs";
|
|
4
4
|
|
|
5
5
|
//#region src/plugins/filter.ts
|
|
6
6
|
/**
|
|
@@ -28,11 +28,9 @@ function filterPlugin(options = {}) {
|
|
|
28
28
|
return createPlugin({ beforeNodeProcess(event) {
|
|
29
29
|
const { node } = event;
|
|
30
30
|
if (node.type === TEXT_NODE) {
|
|
31
|
-
|
|
32
|
-
let currentParent$1 = textNode.parent;
|
|
31
|
+
let currentParent$1 = node.parent;
|
|
33
32
|
while (currentParent$1 && excludeSelectors.length) {
|
|
34
|
-
|
|
35
|
-
if (parentShouldExclude) return { skip: true };
|
|
33
|
+
if (excludeSelectors.some((selector) => selector.matches(currentParent$1))) return { skip: true };
|
|
36
34
|
currentParent$1 = currentParent$1.parent;
|
|
37
35
|
}
|
|
38
36
|
return;
|
|
@@ -41,22 +39,19 @@ function filterPlugin(options = {}) {
|
|
|
41
39
|
const element = node;
|
|
42
40
|
if (excludeSelectors.length) {
|
|
43
41
|
if (element.attributes.style?.includes("absolute") || element.attributes.style?.includes("fixed")) return { skip: true };
|
|
44
|
-
|
|
45
|
-
if (shouldExclude) return { skip: true };
|
|
42
|
+
if (excludeSelectors.some((selector) => selector.matches(element))) return { skip: true };
|
|
46
43
|
}
|
|
47
44
|
let currentParent = element.parent;
|
|
48
45
|
while (currentParent) {
|
|
49
46
|
if (excludeSelectors.length) {
|
|
50
|
-
|
|
51
|
-
if (parentShouldExclude) return { skip: true };
|
|
47
|
+
if (excludeSelectors.some((selector) => selector.matches(currentParent))) return { skip: true };
|
|
52
48
|
}
|
|
53
49
|
currentParent = currentParent.parent;
|
|
54
50
|
}
|
|
55
51
|
if (includeSelectors.length) {
|
|
56
52
|
let currentElement = element;
|
|
57
53
|
while (currentElement) {
|
|
58
|
-
|
|
59
|
-
if (shouldInclude) return;
|
|
54
|
+
if (includeSelectors.some((selector) => selector.matches(currentElement))) return;
|
|
60
55
|
if (!processChildren) break;
|
|
61
56
|
currentElement = currentElement.parent;
|
|
62
57
|
}
|
|
@@ -102,25 +97,20 @@ function frontmatterPlugin(options = {}) {
|
|
|
102
97
|
}
|
|
103
98
|
if (inHead && node.type === ELEMENT_NODE && node.tagId === TAG_TITLE) return;
|
|
104
99
|
if (inHead && node.type === ELEMENT_NODE && node.tagId === TAG_META) {
|
|
105
|
-
const
|
|
106
|
-
const { name, property, content } = elementNode.attributes || {};
|
|
100
|
+
const { name, property, content } = node.attributes || {};
|
|
107
101
|
const metaName = property || name;
|
|
108
102
|
if (metaName && content && metaFields.has(metaName)) frontmatter.meta[metaName.includes(":") ? `"${metaName}"` : metaName] = formatValue(metaName, content);
|
|
109
|
-
return
|
|
103
|
+
return;
|
|
110
104
|
}
|
|
111
105
|
},
|
|
112
106
|
onNodeExit(node, state) {
|
|
113
107
|
if (node.type === ELEMENT_NODE && node.tagId === TAG_HEAD) {
|
|
114
108
|
inHead = false;
|
|
115
|
-
if (Object.keys(frontmatter).length > 0) {
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
regionId: 0
|
|
120
|
-
}, frontmatterContent, state);
|
|
121
|
-
}
|
|
109
|
+
if (Object.keys(frontmatter).length > 0) collectNodeContent({
|
|
110
|
+
type: 1,
|
|
111
|
+
regionId: 0
|
|
112
|
+
}, generateFrontmatter(), state);
|
|
122
113
|
}
|
|
123
|
-
return void 0;
|
|
124
114
|
},
|
|
125
115
|
processTextNode(node) {
|
|
126
116
|
if (!inHead) return;
|
|
@@ -237,8 +227,7 @@ function isolateMainPlugin() {
|
|
|
237
227
|
}
|
|
238
228
|
}
|
|
239
229
|
if (firstHeaderElement && !afterFooter && element.tagId === TAG_FOOTER) {
|
|
240
|
-
|
|
241
|
-
if (depthDifference <= 5) {
|
|
230
|
+
if (element.depth - firstHeaderElement.depth <= 5) {
|
|
242
231
|
afterFooter = true;
|
|
243
232
|
return { skip: true };
|
|
244
233
|
}
|
|
@@ -400,8 +389,7 @@ function readabilityPlugin() {
|
|
|
400
389
|
node.context.tagCount = 1;
|
|
401
390
|
node.context.linkTextLength = 0;
|
|
402
391
|
node.context.textLength = 0;
|
|
403
|
-
|
|
404
|
-
if (hasStrongNegativePattern) createBufferRegion(node, state, false);
|
|
392
|
+
if (node.name && /nav|header|footer|aside|form|fieldset|button/i.test(node.name) || node.attributes?.class && /nav|menu|header|footer|sidebar|hidden|copyright|ad-|advertisement|banner|promo|related|comment|login|register|subscribe|newsletter|category|meta|tag|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(node.attributes.class) || node.attributes?.id && /nav|menu|header|footer|sidebar|hidden|copyright|ad-|advertisement|banner|promo|related|comment|login|register|subscribe|newsletter|category|meta|tag|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(node.attributes.id) || node.attributes?.style && /display:\s*none|visibility:\s*hidden/i.test(node.attributes.style) || node.attributes && Object.keys(node.attributes).some((attr) => attr.startsWith("aria-") && node.attributes[attr] === "true" && /hidden|invisible/i.test(attr))) createBufferRegion(node, state, false);
|
|
405
393
|
else if (node.parent && node.parent.context) node.context.score = (node.context.score || 0) + (node.parent.context.score || 0);
|
|
406
394
|
},
|
|
407
395
|
processTextNode(node) {
|
|
@@ -418,7 +406,6 @@ function readabilityPlugin() {
|
|
|
418
406
|
if (isInsideLink) parent.context.linkTextLength = (parent.context.linkTextLength || 0) + len;
|
|
419
407
|
parent = parent.parent;
|
|
420
408
|
}
|
|
421
|
-
return void 0;
|
|
422
409
|
},
|
|
423
410
|
onNodeExit(node, state) {
|
|
424
411
|
if (!node.context) return;
|
|
@@ -505,14 +492,13 @@ const TAILWIND_TO_MARKDOWN_MAP = {
|
|
|
505
492
|
* Extract base class name from a responsive breakpoint variant
|
|
506
493
|
*/
|
|
507
494
|
function extractBaseClass(className) {
|
|
508
|
-
const
|
|
495
|
+
for (const bp of [
|
|
509
496
|
"sm:",
|
|
510
497
|
"md:",
|
|
511
498
|
"lg:",
|
|
512
499
|
"xl:",
|
|
513
500
|
"2xl:"
|
|
514
|
-
]
|
|
515
|
-
for (const bp of breakpoints) if (className.startsWith(bp)) return {
|
|
501
|
+
]) if (className.startsWith(bp)) return {
|
|
516
502
|
baseClass: className.substring(bp.length),
|
|
517
503
|
breakpoint: bp
|
|
518
504
|
};
|
|
@@ -604,8 +590,7 @@ function processTailwindClasses(classes) {
|
|
|
604
590
|
let prefix = "";
|
|
605
591
|
let suffix = "";
|
|
606
592
|
let hidden = false;
|
|
607
|
-
const
|
|
608
|
-
const grouped = groupByFormattingType(normalizedClasses);
|
|
593
|
+
const grouped = groupByFormattingType(normalizeClasses(classes));
|
|
609
594
|
if (grouped.weight.length > 0) {
|
|
610
595
|
const { baseClass } = extractBaseClass(grouped.weight[0]);
|
|
611
596
|
const mapping = TAILWIND_TO_MARKDOWN_MAP[baseClass];
|
|
@@ -660,8 +645,7 @@ function tailwindPlugin() {
|
|
|
660
645
|
processAttributes(node) {
|
|
661
646
|
const classAttr = node.attributes?.class;
|
|
662
647
|
if (!classAttr) return;
|
|
663
|
-
const
|
|
664
|
-
const { prefix, suffix, hidden } = processTailwindClasses(classes);
|
|
648
|
+
const { prefix, suffix, hidden } = processTailwindClasses(classAttr.trim().split(" ").filter(Boolean));
|
|
665
649
|
node.context = node.context || {};
|
|
666
650
|
node.context.tailwind = {
|
|
667
651
|
prefix,
|
|
@@ -671,7 +655,7 @@ function tailwindPlugin() {
|
|
|
671
655
|
},
|
|
672
656
|
processTextNode(node) {
|
|
673
657
|
const parentNode = node.parent;
|
|
674
|
-
if (!parentNode || parentNode.type !== ELEMENT_NODE) return
|
|
658
|
+
if (!parentNode || parentNode.type !== ELEMENT_NODE) return;
|
|
675
659
|
const tailwindData = parentNode.context?.tailwind;
|
|
676
660
|
if (tailwindData?.hidden) return {
|
|
677
661
|
content: "",
|
|
@@ -691,13 +675,11 @@ function tailwindPlugin() {
|
|
|
691
675
|
},
|
|
692
676
|
beforeNodeProcess({ node }) {
|
|
693
677
|
if (node.type === ELEMENT_NODE) {
|
|
694
|
-
|
|
695
|
-
const tailwindData = elementNode.context?.tailwind;
|
|
696
|
-
if (tailwindData?.hidden) return { skip: true };
|
|
678
|
+
if ((node.context?.tailwind)?.hidden) return { skip: true };
|
|
697
679
|
}
|
|
698
680
|
}
|
|
699
681
|
});
|
|
700
682
|
}
|
|
701
683
|
|
|
702
684
|
//#endregion
|
|
703
|
-
export { filterPlugin, frontmatterPlugin
|
|
685
|
+
export { filterPlugin as a, frontmatterPlugin as i, readabilityPlugin as n, isolateMainPlugin as r, tailwindPlugin as t };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./markdown-processor-D26Uo5td.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/stream.ts
|
|
4
4
|
/**
|
|
@@ -22,8 +22,7 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
|
22
22
|
while (true) {
|
|
23
23
|
const { done, value } = await reader.read();
|
|
24
24
|
if (done) break;
|
|
25
|
-
|
|
26
|
-
remainingHtml = parseHtmlStream(htmlContent, parseState, (event) => {
|
|
25
|
+
remainingHtml = parseHtmlStream(`${remainingHtml}${typeof value === "string" ? value : decoder.decode(value, { stream: true })}`, parseState, (event) => {
|
|
27
26
|
processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
|
|
28
27
|
});
|
|
29
28
|
const chunk = processor.getMarkdownChunk();
|
|
@@ -49,4 +48,4 @@ function htmlToMarkdown(html, options = {}) {
|
|
|
49
48
|
}
|
|
50
49
|
|
|
51
50
|
//#endregion
|
|
52
|
-
export {
|
|
51
|
+
export { streamHtmlToMarkdown as n, htmlToMarkdown as t };
|
|
@@ -317,4 +317,4 @@ interface SplitterOptions extends HTMLToMarkdownOptions {
|
|
|
317
317
|
keepSeparator?: boolean;
|
|
318
318
|
}
|
|
319
319
|
//#endregion
|
|
320
|
-
export {
|
|
320
|
+
export { TagHandler as _, HandlerContext as a, ExtractedElement as b, MdreamRuntimeState as c, Plugin as d, PluginContext as f, TEXT_NODE as g, SplitterOptions as h, HTMLToMarkdownOptions as i, Node as l, ReadabilityContext as m, ELEMENT_NODE as n, MarkdownChunk as o, PluginCreationOptions as p, ElementNode as r, MdreamProcessingState as s, BufferRegion as t, NodeEvent as u, TailwindContext as v, extractionPlugin as x, TextNode as y };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import "./_chunks/const-
|
|
2
|
-
import "./_chunks/markdown-processor-
|
|
3
|
-
import "./_chunks/plugin-
|
|
4
|
-
import { streamHtmlToMarkdown } from "./_chunks/src-
|
|
5
|
-
import "./_chunks/extraction-
|
|
6
|
-
import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-
|
|
7
|
-
import "./_chunks/plugins-
|
|
8
|
-
import { withMinimalPreset } from "./_chunks/minimal-
|
|
1
|
+
import "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import "./_chunks/markdown-processor-D26Uo5td.mjs";
|
|
3
|
+
import "./_chunks/plugin-CjWWQTuL.mjs";
|
|
4
|
+
import { n as streamHtmlToMarkdown } from "./_chunks/src-BJpipdul.mjs";
|
|
5
|
+
import "./_chunks/extraction-BA9MDtq3.mjs";
|
|
6
|
+
import { t as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-D7Hduhij.mjs";
|
|
7
|
+
import "./_chunks/plugins-DJnqR2fA.mjs";
|
|
8
|
+
import { t as withMinimalPreset } from "./_chunks/minimal-BiDhcwif.mjs";
|
|
9
9
|
import { readFileSync } from "node:fs";
|
|
10
10
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
11
11
|
import { Readable } from "node:stream";
|
|
@@ -38,12 +38,8 @@ async function generateLlms(patterns, options) {
|
|
|
38
38
|
generateMarkdown: artifacts.includes("markdown")
|
|
39
39
|
});
|
|
40
40
|
await mkdir(outputDir, { recursive: true });
|
|
41
|
-
|
|
42
|
-
await writeFile(
|
|
43
|
-
if (artifacts.includes("llms-full.txt") && result.llmsFullTxt) {
|
|
44
|
-
const fullPath = join(outputDir, "llms-full.txt");
|
|
45
|
-
await writeFile(fullPath, result.llmsFullTxt, "utf-8");
|
|
46
|
-
}
|
|
41
|
+
await writeFile(join(outputDir, "llms.txt"), result.llmsTxt, "utf-8");
|
|
42
|
+
if (artifacts.includes("llms-full.txt") && result.llmsFullTxt) await writeFile(join(outputDir, "llms-full.txt"), result.llmsFullTxt, "utf-8");
|
|
47
43
|
if (artifacts.includes("markdown") && result.markdownFiles) for (const mdFile of result.markdownFiles) {
|
|
48
44
|
const fullPath = join(outputDir, mdFile.path);
|
|
49
45
|
await mkdir(dirname(fullPath), { recursive: true });
|
|
@@ -55,10 +51,8 @@ async function generateLlms(patterns, options) {
|
|
|
55
51
|
process.exit(1);
|
|
56
52
|
}
|
|
57
53
|
}
|
|
58
|
-
const
|
|
59
|
-
const
|
|
60
|
-
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
61
|
-
const version = packageJson.version;
|
|
54
|
+
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
55
|
+
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
62
56
|
const cli = cac();
|
|
63
57
|
cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
|
|
64
58
|
await streamingConvert(opts);
|
package/dist/iife.js
CHANGED
|
@@ -1,22 +1,12 @@
|
|
|
1
|
-
(function() {
|
|
2
|
-
'use strict';
|
|
1
|
+
function e(e,t,n){if(!t)return;let r=e.regionId||0,i=n.regionContentBuffers.get(r);i&&(i.push(t),n.lastContentCache=t)}function t(e){let t=[];for(let[n,r]of Array.from(e.regionContentBuffers.entries()))e.regionToggles.get(n)&&t.push(...r);return e.regionToggles.clear(),e.regionContentBuffers.clear(),t.join(``).trimStart()}const n={"&":`&`,"<":`<`,">":`>`,""":`"`,"'":`'`,"'":`'`," ":` `},r={html:0,head:1,details:2,summary:3,title:4,meta:5,br:6,h1:7,h2:8,h3:9,h4:10,h5:11,h6:12,hr:13,strong:14,b:15,em:16,i:17,del:18,sub:19,sup:20,ins:21,blockquote:22,code:23,ul:24,li:25,a:26,img:27,table:28,thead:29,tr:30,th:31,td:32,ol:33,pre:34,p:35,div:36,span:37,tbody:38,tfoot:39,form:40,nav:41,label:42,button:43,body:44,center:45,kbd:46,footer:47,path:48,svg:49,article:50,section:51,script:52,style:53,link:54,area:55,base:56,col:57,embed:58,input:59,keygen:60,param:61,source:62,track:63,wbr:64,select:65,textarea:66,option:67,fieldset:68,legend:69,audio:70,video:71,canvas:72,iframe:73,map:74,dialog:75,meter:76,progress:77,template:78,abbr:79,mark:80,q:81,samp:82,small:83,noscript:84,noframes:85,xmp:86,plaintext:87,aside:88,u:89,cite:90,dfn:91,var:92,time:93,bdo:94,ruby:95,rt:96,rp:97,dd:98,dt:99,dl:101,address:100,figure:102,object:103,main:104,header:105,figcaption:106,caption:107},i=[0,0],a=[2,2],o=[1,1],s=[1,0],c=[0,1];function l(e,t){if(!e)return e;if(e.startsWith(`//`))return`https:${e}`;if(e.startsWith(`#`))return e;if(t){if(e.startsWith(`/`)&&t)return`${t.endsWith(`/`)?t.slice(0,-1):t}${e}`;if(e.startsWith(`./`))return`${t}/${e.slice(2)}`;if(!e.startsWith(`http`))return`${t}/${e.startsWith(`/`)?e.slice(1):e}`}return e}function u(e){return e.depthMap[32]>0}function d(e){if(!e)return``;let t=e.split(` `).map(e=>e.split(`language-`)[1]).filter(Boolean);return t.length>0?t[0].trim():``}function f(e){return{enter:({node:t})=>t.depthMap[26]?`<h${e}>`:`${`#`.repeat(e)} `,exit:({node:t})=>{if(t.depthMap[26])return`</h${e}>`},collapsesInnerWhiteSpace:!0}}const 
p={enter:({node:e})=>e.depthMap[15]>1?``:`**`,exit:({node:e})=>e.depthMap[15]>1?``:`**`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},m={enter:({node:e})=>e.depthMap[17]>1?``:`_`,exit:({node:e})=>e.depthMap[17]>1?``:`_`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},h={1:{spacing:i,collapsesInnerWhiteSpace:!0},2:{enter:()=>`<details>`,exit:()=>`</details>
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
`},3:{enter:()=>`<summary>`,exit:()=>`</summary>
|
|
5
4
|
|
|
6
|
-
`},[a]:{enter:()=>`<summary>`,exit:()=>`</summary>
|
|
7
|
-
|
|
8
|
-
`},[o]:{collapsesInnerWhiteSpace:!0,isNonNesting:!0,spacing:z},[N]:{excludesTextNodes:!0,isNonNesting:!0},[P]:{isNonNesting:!0,excludesTextNodes:!0},[s]:{collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z},[c]:{enter:({node:e})=>V(e)?`<br>`:void 0,isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[l]:H(1),[u]:H(2),[d]:H(3),[f]:H(4),[p]:H(5),[m]:H(6),[h]:{enter:()=>kt,isSelfClosing:!0},[g]:Ft,[_]:Ft,[v]:It,[y]:It,[ee]:{enter:()=>Et,exit:()=>Et,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[te]:{enter:()=>`<sub>`,exit:()=>`</sub>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ne]:{enter:()=>`<sup>`,exit:()=>`</sup>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[re]:{enter:()=>`<ins>`,exit:()=>`</ins>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[b]:{enter:({node:e})=>{let t=e.depthMap[b]||1,n=`> `.repeat(t);return e.depthMap[C]>0&&(n=`\n${` `.repeat(e.depthMap[C])}${n}`),n},spacing:jt},[x]:{enter:({node:e})=>{if((e.depthMap[O]||0)>0){let t=Pt(e.attributes?.class);return`${Dt}${t}\n`}return Ot},exit:({node:e})=>e.depthMap[O]>0?`\n${Dt}`:Ot,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[S]:{enter:({node:e})=>V(e)?`<ul>`:void 0,exit:({node:e})=>V(e)?`</ul>`:void 0},[C]:{enter:({node:e})=>{if(V(e))return`<li>`;let t=(e.depthMap[S]||0)+(e.depthMap[D]||0)-1,n=e.parent?.tagId===D,r=` `.repeat(Math.max(0,t)),i=n?`${e.index+1}. 
`:`- `;return`${r}${i}`},exit:({node:e})=>V(e)?`</li>`:void 0,spacing:Mt},[w]:{enter:({node:e})=>{if(e.attributes?.href)return`[`},exit:({node:e,state:t})=>{if(!e.attributes?.href)return``;let n=Nt(e.attributes?.href||``,t.options?.origin),r=e.attributes?.title,i=t.lastContentCache;return i===r&&(r=``),r?`](${n} "${r}")`:`](${n})`},collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ie]:{enter:({node:e,state:t})=>{let n=e.attributes?.alt||``,r=Nt(e.attributes?.src||``,t.options?.origin);return``},collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:z,isInline:!0},[T]:{enter:({node:e,state:t})=>{if(V(e))return`<table>`;e.depthMap[T]<=1&&(t.tableRenderedTable=!1),t.tableColumnAlignments=[]},exit:({node:e})=>V(e)?`</table>`:void 0},[ae]:{enter:({node:e})=>{if(V(e))return`<thead>`},exit:({node:e})=>V(e)?`</thead>`:void 0,spacing:B,excludesTextNodes:!0},[oe]:{enter:({node:e,state:t})=>V(e)?`<tr>`:(t.tableCurrentRowCells=0,`| `),exit:({node:e,state:t})=>{if(V(e)||e.depthMap[T]>1)return`</tr>`;if(!t.tableRenderedTable){t.tableRenderedTable=!0;let e=t.tableColumnAlignments;for(;e.length<t.tableCurrentRowCells;)e.push(``);let n=e.map(e=>{switch(e){case`left`:return`:---`;case`center`:return`:---:`;case`right`:return`---:`;default:return`---`}});return` |\n| ${n.join(` | `)} |`}return` |`},excludesTextNodes:!0,spacing:B},[se]:{enter:({node:e,state:t})=>{if(e.depthMap[T]>1)return`<th>`;let n=e.attributes?.align?.toLowerCase();return n?t.tableColumnAlignments.push(n):t.tableColumnAlignments.length<=t.tableCurrentRowCells&&t.tableColumnAlignments.push(``),e.index===0?``:` | `},exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</th>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[E]:{enter:({node:e})=>e.depthMap[T]>1?`<td>`:e.index===0?``:` | 
`,exit:({node:e,state:t})=>{if(e.depthMap[T]>1)return`</td>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:z},[k]:{},[A]:{},[j]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[de]:{},[fe]:{collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[pe]:{collapsesInnerWhiteSpace:!0,isInline:!0},[me]:{spacing:z},[he]:{enter:({node:e})=>{if(e.depthMap[T]>1)return`<center>`},exit:({node:e})=>{if(e.depthMap[T]>1)return`</center>`},spacing:z},[ce]:{spacing:z,excludesTextNodes:!0},[le]:{spacing:B,excludesTextNodes:!0},[ge]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[_e]:{spacing:z},[ue]:{spacing:z},[F]:{isSelfClosing:!0,spacing:z,collapsesInnerWhiteSpace:!0,isInline:!0},[xe]:{isSelfClosing:!0,spacing:z,isInline:!0},[Se]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ce]:{isSelfClosing:!0,spacing:z},[we]:{isSelfClosing:!0,spacing:z},[Te]:{isSelfClosing:!0,spacing:z,isInline:!0},[Ee]:{isSelfClosing:!0,spacing:z,isInline:!0},[De]:{isSelfClosing:!0,spacing:z},[Oe]:{isSelfClosing:!0,spacing:z},[ke]:{isSelfClosing:!0,spacing:z},[Ae]:{isSelfClosing:!0,spacing:z,isInline:!0},[M]:{spacing:z},[je]:{spacing:z},[Me]:{isNonNesting:!0,spacing:z},[Ne]:{isNonNesting:!0,spacing:z},[Pe]:{spacing:z},[Fe]:{spacing:z},[Ie]:{spacing:z},[Le]:{spacing:z},[Re]:{spacing:z},[ze]:{isNonNesting:!0,spacing:z},[Be]:{spacing:z},[Ve]:{spacing:z},[He]:{spacing:z},[Ue]:{spacing:z},[We]:{spacing:z},[Ge]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ke]:{enter:()=>`<mark>`,exit:()=>`</mark>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[qe]:{enter:()=>`"`,exit:()=>`"`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Je]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Ye]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[Xe]:{excludesTextNodes:!0,spacing:z},[Ze]:{isNonNesting:!0,spacing:z},[Qe]:{isNonNesting:!0,spacing:z},[$e]:{isNonNesting:!0,spacing:z},[et]:{spacing
:z},[tt]:{enter:()=>`<u>`,exit:()=>`</u>`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[nt]:{enter:()=>`*`,exit:()=>`*`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[rt]:{enter:()=>`**`,exit:()=>`**`,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[it]:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[at]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ot]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[st]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ct]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[lt]:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:z,isInline:!0},[ft]:{enter:()=>`<address>`,exit:()=>`</address>`,spacing:z,collapsesInnerWhiteSpace:!0},[pt]:{spacing:z,enter:()=>`<dl>`,exit:()=>`</dl>`},[dt]:{enter:()=>`<dt>`,exit:()=>`</dt>`,collapsesInnerWhiteSpace:!0,spacing:[0,1]},[ut]:{enter:()=>`<dd>`,exit:()=>`</dd>`,spacing:[0,1]}};function Rt(e){let t=``,n=0;for(;n<e.length;){if(e[n]===`&`){let r=!1;for(let[i,a]of Object.entries(xt))if(e.startsWith(i,n)){t+=a,n+=i.length,r=!0;break}if(r)continue;if(n+2<e.length&&e[n+1]===`#`){let r=n;n+=2;let i=e[n]===`x`||e[n]===`X`;i&&n++;let a=n;for(;n<e.length&&e[n]!==`;`;)n++;if(n<e.length&&e[n]===`;`){let r=e.substring(a,n),o=i?16:10;try{let e=Number.parseInt(r,o);if(!Number.isNaN(e)){t+=String.fromCodePoint(e),n++;continue}}catch{}}n=r}}t+=e[n],n++}return t}function zt(e){let t=e,n=[t];for(;t.tagHandler?.isInline&&t.parent;)t=t.parent,n.push(t);return n}const Bt=60,U=62,W=47,G=61,K=34,q=39,Vt=33,Ht=38,J=92,Y=45,X=32,Ut=9,Wt=10,Gt=13,Kt=96,qt=124,Jt=91,Yt=93,Xt=Object.freeze({});function Zt(e){return new Uint8Array(e)}function Z(e){return e===X||e===Ut||e===Wt||e===Gt}function Qt(e,t,n){return $t(e,t,n)}function $t(e,t,n){let r=``;t.depthMap??=new 
Uint8Array(bt),t.depth??=0,t.lastCharWasWhitespace??=!0,t.justClosedTag??=!1,t.isFirstTextInElement??=!1,t.lastCharWasBackslash??=!1;let i=0,a=e.length;for(;i<a;){let o=e.charCodeAt(i);if(o!==Bt){if(o===Ht&&(t.hasEncodedHtmlEntity=!0),Z(o)){let n=t.depthMap[O]>0;if(t.justClosedTag&&(t.justClosedTag=!1,t.lastCharWasWhitespace=!1),!n&&t.lastCharWasWhitespace){i++;continue}n?r+=e[i]:(o===X||!t.lastCharWasWhitespace)&&(r+=` `),t.lastCharWasWhitespace=!0,t.textBufferContainsWhitespace=!0,t.lastCharWasBackslash=!1}else t.textBufferContainsNonWhitespace=!0,t.lastCharWasWhitespace=!1,t.justClosedTag=!1,o===qt&&t.depthMap[T]?r+=`\\|`:o===Kt&&(t.depthMap[x]||t.depthMap[O])?r+="\\`":o===Jt&&t.depthMap[w]?r+=`\\[`:o===Yt&&t.depthMap[w]?r+=`\\]`:o===U&&t.depthMap[b]?r+=`\\>`:r+=e[i],t.currentNode?.tagHandler?.isNonNesting&&(t.lastCharWasBackslash||(o===q&&!t.inDoubleQuote&&!t.inBacktick?t.inSingleQuote=!t.inSingleQuote:o===K&&!t.inSingleQuote&&!t.inBacktick?t.inDoubleQuote=!t.inDoubleQuote:o===Kt&&!t.inSingleQuote&&!t.inDoubleQuote&&(t.inBacktick=!t.inBacktick))),t.lastCharWasBackslash=o===J;i++;continue}if(i+1>=a){r+=e[i];break}let s=e.charCodeAt(i+1);if(s===Vt){r.length>0&&(Q(r,t,n),r=``);let a=tn(e,i);if(a.complete)i=a.newPosition;else{r+=a.remainingText;break}}else if(s===W){let a=t.inSingleQuote||t.inDoubleQuote||t.inBacktick;if(t.currentNode?.tagHandler?.isNonNesting&&a){r+=e[i],i++;continue}r.length>0&&(Q(r,t,n),r=``);let o=en(e,i,t,n);if(o.complete)i=o.newPosition;else{r+=o.remainingText;break}}else{let o=i+1,s=o,c=-1;for(;o<a;){let t=e.charCodeAt(o);if(Z(t)||t===W||t===U){c=o;break}o++}if(c===-1){r+=e.substring(i);break}let l=e.substring(s,c).toLowerCase();if(!l){i=c;break}let u=Ct[l]??-1;if(o=c,t.currentNode?.tagHandler?.isNonNesting&&u!==t.currentNode?.tagId){r+=e[i++];continue}r.length>0&&(Q(r,t,n),r=``);let d=nn(l,u,e,o,t,n);if(d.skip)r+=e[i++];else if(d.complete)i=d.newPosition,d.selfClosing||(t.isFirstTextInElement=!0);else{r+=d.remainingText;break}}}return 
r}function Q(e,t,n){let r=t.textBufferContainsNonWhitespace,i=t.textBufferContainsWhitespace;if(t.textBufferContainsNonWhitespace=!1,t.textBufferContainsWhitespace=!1,!t.currentNode)return;let a=t.currentNode?.tagHandler?.excludesTextNodes,o=t.depthMap[O]>0;if(!o&&!r&&!t.currentNode.childTextNodeIndex)return;let s=e;if(s.length===0)return;let c=zt(t.currentNode),l=c[c.length-1];if(i&&!l?.childTextNodeIndex){let e=0;for(;e<s.length&&(o?s.charCodeAt(e)===Wt||s.charCodeAt(e)===Gt:Z(s.charCodeAt(e)));)e++;e>0&&(s=s.substring(e))}t.hasEncodedHtmlEntity&&(s=Rt(String(s)),t.hasEncodedHtmlEntity=!1);let u={type:L,value:s,parent:t.currentNode,regionId:t.currentNode?.regionId,index:t.currentNode.currentWalkIndex++,depth:t.depth,containsWhitespace:i,excludedFromMarkdown:a};for(let e of c)e.childTextNodeIndex=(e.childTextNodeIndex||0)+1;n({type:R,node:u}),t.lastTextNode=u}function en(e,t,n,r){let i=t+2,a=i,o=e.length,s=!1;for(;i<o;){let t=e.charCodeAt(i);if(t===U){s=!0;break}i++}if(!s)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let c=e.substring(a,i).toLowerCase(),l=Ct[c]??-1;if(n.currentNode?.tagHandler?.isNonNesting&&l!==n.currentNode.tagId)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let u=n.currentNode;if(u){let e=u.tagId!==l;for(;u&&e;)$(u,n,r),u=u.parent,e=u?.tagId!==l}return u&&$(u,n,r),n.justClosedTag=!0,{complete:!0,newPosition:i+1,remainingText:``}}function $(e,t,n){if(e){if(e.tagId===w&&!e.childTextNodeIndex){let t=e.attributes?.title||e.attributes?.[`aria-label`]||``;if(t){e.childTextNodeIndex=1;let r={type:L,value:t,parent:e,index:0,depth:e.depth+1};n({type:R,node:r});for(let t of zt(e))t.childTextNodeIndex=(t.childTextNodeIndex||0)+1}}e.tagId&&(t.depthMap[e.tagId]=Math.max(0,t.depthMap[e.tagId]-1)),e.tagHandler?.isNonNesting&&(t.inSingleQuote=!1,t.inDoubleQuote=!1,t.inBacktick=!1,t.lastCharWasBackslash=!1),t.depth--,n({type:St,node:e}),t.currentNode=t.currentNode.parent,t.hasEncodedHtmlEntity=!1,t.justClosedTag=!0}}function 
tn(e,t){let n=t,r=e.length;if(n+3<r&&e.charCodeAt(n+2)===Y&&e.charCodeAt(n+3)===Y){for(n+=4;n<r-2;){if(e.charCodeAt(n)===Y&&e.charCodeAt(n+1)===Y&&e.charCodeAt(n+2)===U)return n+=3,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:t,remainingText:e.substring(t)}}else{for(n+=2;n<r;){if(e.charCodeAt(n)===U)return n++,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:n,remainingText:e.substring(t,n)}}}function nn(e,t,n,r,i,a){i.currentNode?.tagHandler?.isNonNesting&&$(i.currentNode,i,a);let o=Lt[t],s=rn(n,r,o);if(!s.complete)return{complete:!1,newPosition:r,remainingText:`<${e}${s.attrBuffer}`,selfClosing:!1};let c=i.depthMap[t];i.depthMap[t]=c+1,i.depth++,r=s.newPosition,i.currentNode&&(i.currentNode.currentWalkIndex=i.currentNode.currentWalkIndex||0);let l=i.currentNode?i.currentNode.currentWalkIndex++:0,u={type:I,name:e,attributes:s.attributes,parent:i.currentNode,depthMap:Zt(i.depthMap),depth:i.depth,index:l,regionId:i.currentNode?.regionId,tagId:t,tagHandler:o};i.lastTextNode=u,a({type:R,node:u});let d=u;return d.currentWalkIndex=0,i.currentNode=d,i.hasEncodedHtmlEntity=!1,o?.isNonNesting&&!s.selfClosing&&(i.inSingleQuote=!1,i.inDoubleQuote=!1,i.inBacktick=!1,i.lastCharWasBackslash=!1),s.selfClosing?($(u,i,a),i.justClosedTag=!0):i.justClosedTag=!1,{complete:!0,newPosition:r,remainingText:``,selfClosing:s.selfClosing}}function rn(e,t,n){let r=t,i=e.length,a=n?.isSelfClosing||!1,o=r,s=!1,c=0,l=0;for(;r<i;){let t=e.charCodeAt(r);if(s){t===c&&l!==J&&(s=!1),r++;continue}else if(t===K||t===q)s=!0,c=t;else if(t===W&&r+1<i&&e.charCodeAt(r+1)===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+2,attributes:an(t),selfClosing:!0,attrBuffer:t}}else if(t===U){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+1,attributes:an(t),selfClosing:a,attrBuffer:t}}r++,l=t}return{complete:!1,newPosition:r,attributes:Xt,selfClosing:!1,attrBuffer:e.substring(o,r)}}function an(e){if(!e)return Xt;let 
t={},n=e.length,r=0,i=0,a=1,o=2,s=3,c=4,l=5,u=i,d=0,f=0,p=0,m=0,h=``;for(;r<n;){let g=e.charCodeAt(r),_=Z(g);switch(u){case i:_||(u=a,d=r,f=0);break;case a:(g===G||_)&&(f=r,h=e.substring(d,f).toLowerCase(),u=g===G?s:o);break;case o:g===G?u=s:_||(t[h]=``,u=a,d=r,f=0);break;case s:g===K||g===q?(m=g,u=c,p=r+1):_||(u=l,p=r);break;case c:g===J&&r+1<n?r++:g===m&&(t[h]=e.substring(p,r),u=i);break;case l:(_||g===U)&&(t[h]=e.substring(p,r),u=i);break}r++}if(u===c||u===l)h&&(t[h]=e.substring(p,r));else if(u===a||u===o||u===s){f||=r;let n=e.substring(d,f).toLowerCase();n&&(t[n]=``)}return t}function on(e,t,n,r){if(t?.length){for(let r of t){let t=r.beforeNodeProcess?.(e,n);if(typeof t==`object`&&t.skip)return!0}if(e.node.type===I){let r=e.node;if(e.type===R)for(let e of t)e.processAttributes&&e.processAttributes(r,n);let i=e.type===R?`onNodeEnter`:`onNodeExit`,a=[];for(let e of t)if(e[i]){let t=e[i](r,n);t&&a.push(t)}a.length>0&&(r.pluginOutput=(r.pluginOutput||[]).concat(a))}else if(e.node.type===L&&e.type===R){let r=e.node;for(let e of t)if(e.processTextNode){let t=e.processTextNode(r,n);if(t){if(t.skip)return!0;r.value=t.content}}}}return r(e),!1}function sn(e,t,n){if(e===` `||e===`
|
|
5
|
+
`},4:{collapsesInnerWhiteSpace:!0,isNonNesting:!0,spacing:i},52:{excludesTextNodes:!0,isNonNesting:!0},53:{isNonNesting:!0,excludesTextNodes:!0},5:{collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:i},6:{enter:({node:e})=>u(e)?`<br>`:void 0,isSelfClosing:!0,spacing:i,collapsesInnerWhiteSpace:!0,isInline:!0},7:f(1),8:f(2),9:f(3),10:f(4),11:f(5),12:f(6),13:{enter:()=>`---`,isSelfClosing:!0},14:p,15:p,16:m,17:m,18:{enter:()=>`~~`,exit:()=>`~~`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},19:{enter:()=>`<sub>`,exit:()=>`</sub>`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},20:{enter:()=>`<sup>`,exit:()=>`</sup>`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},21:{enter:()=>`<ins>`,exit:()=>`</ins>`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},22:{enter:({node:e})=>{let t=e.depthMap[22]||1,n=`> `.repeat(t);return e.depthMap[25]>0&&(n=`\n${` `.repeat(e.depthMap[25])}${n}`),n},spacing:o},23:{enter:({node:e})=>(e.depthMap[34]||0)>0?`\`\`\`${d(e.attributes?.class)}\n`:"`",exit:({node:e})=>e.depthMap[34]>0?"\n```":"`",collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},24:{enter:({node:e})=>u(e)?`<ul>`:void 0,exit:({node:e})=>u(e)?`</ul>`:void 0},25:{enter:({node:e})=>{if(u(e))return`<li>`;let t=(e.depthMap[24]||0)+(e.depthMap[33]||0)-1,n=e.parent?.tagId===33;return`${` `.repeat(Math.max(0,t))}${n?`${e.index+1}. 
`:`- `}`},exit:({node:e})=>u(e)?`</li>`:void 0,spacing:s},26:{enter:({node:e})=>{if(e.attributes?.href)return`[`},exit:({node:e,state:t})=>{if(!e.attributes?.href)return``;let n=l(e.attributes?.href||``,t.options?.origin),r=e.attributes?.title;return t.lastContentCache===r&&(r=``),r?`](${n} "${r}")`:`](${n})`},collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},27:{enter:({node:e,state:t})=>`})`,collapsesInnerWhiteSpace:!0,isSelfClosing:!0,spacing:i,isInline:!0},28:{enter:({node:e,state:t})=>{if(u(e))return`<table>`;e.depthMap[28]<=1&&(t.tableRenderedTable=!1),t.tableColumnAlignments=[]},exit:({node:e})=>u(e)?`</table>`:void 0},29:{enter:({node:e})=>{if(u(e))return`<thead>`},exit:({node:e})=>u(e)?`</thead>`:void 0,spacing:c,excludesTextNodes:!0},30:{enter:({node:e,state:t})=>u(e)?`<tr>`:(t.tableCurrentRowCells=0,`| `),exit:({node:e,state:t})=>{if(u(e)||e.depthMap[28]>1)return`</tr>`;if(!t.tableRenderedTable){t.tableRenderedTable=!0;let e=t.tableColumnAlignments;for(;e.length<t.tableCurrentRowCells;)e.push(``);return` |\n| ${e.map(e=>{switch(e){case`left`:return`:---`;case`center`:return`:---:`;case`right`:return`---:`;default:return`---`}}).join(` | `)} |`}return` |`},excludesTextNodes:!0,spacing:c},31:{enter:({node:e,state:t})=>{if(e.depthMap[28]>1)return`<th>`;let n=e.attributes?.align?.toLowerCase();return n?t.tableColumnAlignments.push(n):t.tableColumnAlignments.length<=t.tableCurrentRowCells&&t.tableColumnAlignments.push(``),e.index===0?``:` | `},exit:({node:e,state:t})=>{if(e.depthMap[28]>1)return`</th>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:i},32:{enter:({node:e})=>e.depthMap[28]>1?`<td>`:e.index===0?``:` | 
`,exit:({node:e,state:t})=>{if(e.depthMap[28]>1)return`</td>`;t.tableCurrentRowCells++},collapsesInnerWhiteSpace:!0,spacing:i},35:{},36:{},37:{collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},41:{},42:{collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},43:{collapsesInnerWhiteSpace:!0,isInline:!0},44:{spacing:i},45:{enter:({node:e})=>{if(e.depthMap[28]>1)return`<center>`},exit:({node:e})=>{if(e.depthMap[28]>1)return`</center>`},spacing:i},38:{spacing:i,excludesTextNodes:!0},39:{spacing:c,excludesTextNodes:!0},46:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},47:{spacing:i},40:{spacing:i},54:{isSelfClosing:!0,spacing:i,collapsesInnerWhiteSpace:!0,isInline:!0},55:{isSelfClosing:!0,spacing:i,isInline:!0},56:{isSelfClosing:!0,spacing:i,isInline:!0},57:{isSelfClosing:!0,spacing:i},58:{isSelfClosing:!0,spacing:i},59:{isSelfClosing:!0,spacing:i,isInline:!0},60:{isSelfClosing:!0,spacing:i,isInline:!0},61:{isSelfClosing:!0,spacing:i},62:{isSelfClosing:!0,spacing:i},63:{isSelfClosing:!0,spacing:i},64:{isSelfClosing:!0,spacing:i,isInline:!0},49:{spacing:i},65:{spacing:i},66:{isNonNesting:!0,spacing:i},67:{isNonNesting:!0,spacing:i},68:{spacing:i},69:{spacing:i},70:{spacing:i},71:{spacing:i},72:{spacing:i},73:{isNonNesting:!0,spacing:i},74:{spacing:i},75:{spacing:i},76:{spacing:i},77:{spacing:i},78:{spacing:i},79:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},80:{enter:()=>`<mark>`,exit:()=>`</mark>`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},81:{enter:()=>`"`,exit:()=>`"`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},82:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},83:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},84:{excludesTextNodes:!0,spacing:i},85:{isNonNesting:!0,spacing:i},86:{isNonNesting:!0,spacing:i},87:{isNonNesting:!0,spacing:i},88:{spacing:i},89:{enter:()=>`<u>`,exit:()=>`</u>`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0}
,90:{enter:()=>`*`,exit:()=>`*`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},91:{enter:()=>`**`,exit:()=>`**`,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},92:{enter:()=>"`",exit:()=>"`",collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},93:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},94:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},95:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},96:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},97:{enter:()=>``,exit:()=>``,collapsesInnerWhiteSpace:!0,spacing:i,isInline:!0},100:{enter:()=>`<address>`,exit:()=>`</address>`,spacing:i,collapsesInnerWhiteSpace:!0},101:{spacing:i,enter:()=>`<dl>`,exit:()=>`</dl>`},99:{enter:()=>`<dt>`,exit:()=>`</dt>`,collapsesInnerWhiteSpace:!0,spacing:[0,1]},98:{enter:()=>`<dd>`,exit:()=>`</dd>`,spacing:[0,1]}};function g(e){let t=``,r=0;for(;r<e.length;){if(e[r]===`&`){let i=!1;for(let[a,o]of Object.entries(n))if(e.startsWith(a,r)){t+=o,r+=a.length,i=!0;break}if(i)continue;if(r+2<e.length&&e[r+1]===`#`){let n=r;r+=2;let i=e[r]===`x`||e[r]===`X`;i&&r++;let a=r;for(;r<e.length&&e[r]!==`;`;)r++;if(r<e.length&&e[r]===`;`){let n=e.substring(a,r),o=i?16:10;try{let e=Number.parseInt(n,o);if(!Number.isNaN(e)){t+=String.fromCodePoint(e),r++;continue}}catch{}}r=n}}t+=e[r],r++}return t}function _(e){let t=e,n=[t];for(;t.tagHandler?.isInline&&t.parent;)t=t.parent,n.push(t);return n}const v=Object.freeze({});function y(e){return new Uint8Array(e)}function b(e){return e===32||e===9||e===10||e===13}function x(e,t,n){return S(e,t,n)}function S(e,t,n){let i=``;t.depthMap??=new Uint8Array(108),t.depth??=0,t.lastCharWasWhitespace??=!0,t.justClosedTag??=!1,t.isFirstTextInElement??=!1,t.lastCharWasBackslash??=!1;let a=0,o=e.length;for(;a<o;){let s=e.charCodeAt(a);if(s!==60){if(s===38&&(t.hasEncodedHtmlEntity=!0),b(s)){let 
n=t.depthMap[34]>0;if(t.justClosedTag&&(t.justClosedTag=!1,t.lastCharWasWhitespace=!1),!n&&t.lastCharWasWhitespace){a++;continue}n?i+=e[a]:(s===32||!t.lastCharWasWhitespace)&&(i+=` `),t.lastCharWasWhitespace=!0,t.textBufferContainsWhitespace=!0,t.lastCharWasBackslash=!1}else t.textBufferContainsNonWhitespace=!0,t.lastCharWasWhitespace=!1,t.justClosedTag=!1,s===124&&t.depthMap[28]?i+=`\\|`:s===96&&(t.depthMap[23]||t.depthMap[34])?i+="\\`":s===91&&t.depthMap[26]?i+=`\\[`:s===93&&t.depthMap[26]?i+=`\\]`:s===62&&t.depthMap[22]?i+=`\\>`:i+=e[a],t.currentNode?.tagHandler?.isNonNesting&&(t.lastCharWasBackslash||(s===39&&!t.inDoubleQuote&&!t.inBacktick?t.inSingleQuote=!t.inSingleQuote:s===34&&!t.inSingleQuote&&!t.inBacktick?t.inDoubleQuote=!t.inDoubleQuote:s===96&&!t.inSingleQuote&&!t.inDoubleQuote&&(t.inBacktick=!t.inBacktick))),t.lastCharWasBackslash=s===92;a++;continue}if(a+1>=o){i+=e[a];break}let c=e.charCodeAt(a+1);if(c===33){i.length>0&&(C(i,t,n),i=``);let r=E(e,a);if(r.complete)a=r.newPosition;else{i+=r.remainingText;break}}else if(c===47){let r=t.inSingleQuote||t.inDoubleQuote||t.inBacktick;if(t.currentNode?.tagHandler?.isNonNesting&&r){i+=e[a],a++;continue}i.length>0&&(C(i,t,n),i=``);let o=w(e,a,t,n);if(o.complete)a=o.newPosition;else{i+=o.remainingText;break}}else{let s=a+1,c=s,l=-1;for(;s<o;){let t=e.charCodeAt(s);if(b(t)||t===47||t===62){l=s;break}s++}if(l===-1){i+=e.substring(a);break}let u=e.substring(c,l).toLowerCase();if(!u){a=l;break}let d=r[u]??-1;if(s=l,t.currentNode?.tagHandler?.isNonNesting&&d!==t.currentNode?.tagId){i+=e[a++];continue}i.length>0&&(C(i,t,n),i=``);let f=D(u,d,e,s,t,n);if(f.skip)i+=e[a++];else if(f.complete)a=f.newPosition,f.selfClosing||(t.isFirstTextInElement=!0);else{i+=f.remainingText;break}}}return i}function C(e,t,n){let r=t.textBufferContainsNonWhitespace,i=t.textBufferContainsWhitespace;if(t.textBufferContainsNonWhitespace=!1,t.textBufferContainsWhitespace=!1,!t.currentNode)return;let 
a=t.currentNode?.tagHandler?.excludesTextNodes,o=t.depthMap[34]>0;if(!o&&!r&&!t.currentNode.childTextNodeIndex)return;let s=e;if(s.length===0)return;let c=_(t.currentNode),l=c[c.length-1];if(i&&!l?.childTextNodeIndex){let e=0;for(;e<s.length&&(o?s.charCodeAt(e)===10||s.charCodeAt(e)===13:b(s.charCodeAt(e)));)e++;e>0&&(s=s.substring(e))}t.hasEncodedHtmlEntity&&=(s=g(String(s)),!1);let u={type:2,value:s,parent:t.currentNode,regionId:t.currentNode?.regionId,index:t.currentNode.currentWalkIndex++,depth:t.depth,containsWhitespace:i,excludedFromMarkdown:a};for(let e of c)e.childTextNodeIndex=(e.childTextNodeIndex||0)+1;n({type:0,node:u}),t.lastTextNode=u}function w(e,t,n,i){let a=t+2,o=a,s=e.length,c=!1;for(;a<s;){if(e.charCodeAt(a)===62){c=!0;break}a++}if(!c)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let l=r[e.substring(o,a).toLowerCase()]??-1;if(n.currentNode?.tagHandler?.isNonNesting&&l!==n.currentNode.tagId)return{complete:!1,newPosition:t,remainingText:e.substring(t)};let u=n.currentNode;if(u){let e=u.tagId!==l;for(;u&&e;)T(u,n,i),u=u.parent,e=u?.tagId!==l}return u&&T(u,n,i),n.justClosedTag=!0,{complete:!0,newPosition:a+1,remainingText:``}}function T(e,t,n){if(e){if(e.tagId===26&&!e.childTextNodeIndex){let t=e.attributes?.title||e.attributes?.[`aria-label`]||``;if(t){e.childTextNodeIndex=1,n({type:0,node:{type:2,value:t,parent:e,index:0,depth:e.depth+1}});for(let t of _(e))t.childTextNodeIndex=(t.childTextNodeIndex||0)+1}}e.tagId&&(t.depthMap[e.tagId]=Math.max(0,t.depthMap[e.tagId]-1)),e.tagHandler?.isNonNesting&&(t.inSingleQuote=!1,t.inDoubleQuote=!1,t.inBacktick=!1,t.lastCharWasBackslash=!1),t.depth--,n({type:1,node:e}),t.currentNode=t.currentNode.parent,t.hasEncodedHtmlEntity=!1,t.justClosedTag=!0}}function E(e,t){let n=t,r=e.length;if(n+3<r&&e.charCodeAt(n+2)===45&&e.charCodeAt(n+3)===45){for(n+=4;n<r-2;){if(e.charCodeAt(n)===45&&e.charCodeAt(n+1)===45&&e.charCodeAt(n+2)===62)return 
n+=3,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:t,remainingText:e.substring(t)}}else{for(n+=2;n<r;){if(e.charCodeAt(n)===62)return n++,{complete:!0,newPosition:n,remainingText:``};n++}return{complete:!1,newPosition:n,remainingText:e.substring(t,n)}}}function D(e,t,n,r,i,a){i.currentNode?.tagHandler?.isNonNesting&&T(i.currentNode,i,a);let o=h[t],s=O(n,r,o);if(!s.complete)return{complete:!1,newPosition:r,remainingText:`<${e}${s.attrBuffer}`,selfClosing:!1};let c=i.depthMap[t];i.depthMap[t]=c+1,i.depth++,r=s.newPosition,i.currentNode&&(i.currentNode.currentWalkIndex=i.currentNode.currentWalkIndex||0);let l=i.currentNode?i.currentNode.currentWalkIndex++:0,u={type:1,name:e,attributes:s.attributes,parent:i.currentNode,depthMap:y(i.depthMap),depth:i.depth,index:l,regionId:i.currentNode?.regionId,tagId:t,tagHandler:o};i.lastTextNode=u,a({type:0,node:u});let d=u;return d.currentWalkIndex=0,i.currentNode=d,i.hasEncodedHtmlEntity=!1,o?.isNonNesting&&!s.selfClosing&&(i.inSingleQuote=!1,i.inDoubleQuote=!1,i.inBacktick=!1,i.lastCharWasBackslash=!1),s.selfClosing?(T(u,i,a),i.justClosedTag=!0):i.justClosedTag=!1,{complete:!0,newPosition:r,remainingText:``,selfClosing:s.selfClosing}}function O(e,t,n){let r=t,i=e.length,a=n?.isSelfClosing||!1,o=r,s=!1,c=0,l=0;for(;r<i;){let t=e.charCodeAt(r);if(s){t===c&&l!==92&&(s=!1),r++;continue}else if(t===34||t===39)s=!0,c=t;else if(t===47&&r+1<i&&e.charCodeAt(r+1)===62){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+2,attributes:k(t),selfClosing:!0,attrBuffer:t}}else if(t===62){let t=e.substring(o,r).trim();return{complete:!0,newPosition:r+1,attributes:k(t),selfClosing:a,attrBuffer:t}}r++,l=t}return{complete:!1,newPosition:r,attributes:v,selfClosing:!1,attrBuffer:e.substring(o,r)}}function k(e){if(!e)return v;let t={},n=e.length,r=0,i=0,a=0,o=0,s=0,c=0,l=``;for(;r<n;){let u=e.charCodeAt(r),d=b(u);switch(i){case 0:d||(i=1,a=r,o=0);break;case 
1:(u===61||d)&&(o=r,l=e.substring(a,o).toLowerCase(),i=u===61?3:2);break;case 2:u===61?i=3:d||(t[l]=``,i=1,a=r,o=0);break;case 3:u===34||u===39?(c=u,i=4,s=r+1):d||(i=5,s=r);break;case 4:u===92&&r+1<n?r++:u===c&&(t[l]=e.substring(s,r),i=0);break;case 5:(d||u===62)&&(t[l]=e.substring(s,r),i=0);break}r++}if(i===4||i===5)l&&(t[l]=e.substring(s,r));else if(i===1||i===2||i===3){o||=r;let n=e.substring(a,o).toLowerCase();n&&(t[n]=``)}return t}function A(e,t,n,r){if(t?.length){for(let r of t){let t=r.beforeNodeProcess?.(e,n);if(typeof t==`object`&&t.skip)return!0}if(e.node.type===1){let r=e.node;if(e.type===0)for(let e of t)e.processAttributes&&e.processAttributes(r,n);let i=e.type===0?`onNodeEnter`:`onNodeExit`,a=[];for(let e of t)if(e[i]){let t=e[i](r,n);t&&a.push(t)}a.length>0&&(r.pluginOutput=(r.pluginOutput||[]).concat(a))}else if(e.node.type===2&&e.type===0){let r=e.node;for(let e of t)if(e.processTextNode){let t=e.processTextNode(r,n);if(t){if(t.skip)return!0;r.value=t.content}}}}return r(e),!1}function j(e,t,n){if(e===` `||e===`
|
|
9
6
|
`||e===` `||t===` `||t===`
|
|
10
|
-
`||t===` `)return!1;let r=new Set([`[`,`(`,`>`,`*`,`_`,"`"]),i=new Set([`]`,`)`,`<`,`.`,`,`,`!`,`?`,`:`,`;`,`*`,`_`,"`"]);return e===`|`&&t===`<`&&n&&n.depthMap[
|
|
11
|
-
`&&e!==` `&&e!==`[`&&e!==`>`&&!t?.tagHandler?.isInline&&n.value[0]!==` `}function
|
|
12
|
-
`)return;
|
|
7
|
+
`||t===` `)return!1;let r=new Set([`[`,`(`,`>`,`*`,`_`,"`"]),i=new Set([`]`,`)`,`<`,`.`,`,`,`!`,`?`,`:`,`;`,`*`,`_`,"`"]);return e===`|`&&t===`<`&&n&&n.depthMap[28]>0?!0:!(r.has(e)||i.has(t))}function M(e,t,n){return!!e&&e!==`
|
|
8
|
+
`&&e!==` `&&e!==`[`&&e!==`>`&&!t?.tagHandler?.isInline&&n.value[0]!==` `}function N(e){let t=e.tagId,n=e.depthMap;if(t!==25&&n[25]>0||t!==22&&n[22]>0)return i;let r=t!==void 0&&(t>=7&&t<=12||t===35||t===36),o=e.parent;for(;o;){if(o.tagHandler?.collapsesInnerWhiteSpace){if(r&&o.tagId===37){o=o.parent;continue}return i}o=o.parent}return e.tagHandler?.spacing?e.tagHandler?.spacing:a}function P(n={}){let r={options:n,regionToggles:new Map,regionContentBuffers:new Map,depthMap:new Uint8Array(108)};r.regionToggles.set(0,!0),r.regionContentBuffers.set(0,[]);let i=0;function a(t){let{type:n,node:i}=t,a=r.lastNode;r.lastNode=t.node,r.depth=i.depth;let o=r.regionContentBuffers.get(i.regionId||0)||[],s=o[o.length-1],c=s?.charAt(s.length-1)||``,l;if(l=s?.length>1?s.charAt(s.length-2):o[o.length-2]?.charAt(o[o.length-2].length-1),i.type===2&&n===0){let t=i;if(t.value){if(t.excludedFromMarkdown||t.value===` `&&c===`
|
|
9
|
+
`)return;M(c,a,t)&&(t.value=` ${t.value}`),e(t,t.value,r)}r.lastTextNode=t;return}if(i.type!==1)return;let u={node:i,state:r},d=[],f=i;f.pluginOutput?.length&&(d.push(...f.pluginOutput),f.pluginOutput=[]);let p=r.lastContentCache,m=0;c===`
|
|
13
10
|
`&&m++,l===`
|
|
14
|
-
`&&m++;let h=n===
|
|
15
|
-
`.repeat(
|
|
16
|
-
|
|
17
|
-
// Expose mdream globally
|
|
18
|
-
if (typeof window !== 'undefined') {
|
|
19
|
-
window.mdream = fn;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
})();
|
|
11
|
+
`&&m++;let h=n===0?`enter`:`exit`,g=i.tagHandler;if(!d.length&&g?.[h]){let e=g[h](u);e&&d.push(e)}let _=N(i)[n]||0,v=Math.max(0,_-m);if(v>0){if(!o.length){for(let t of d)e(i,t,r);return}let t=`
|
|
12
|
+
`.repeat(v);c===` `&&o?.length&&(o[o.length-1]=o[o.length-1].substring(0,o[o.length-1].length-1)),n===0?d.unshift(t):d.push(t)}else if(p&&r.lastTextNode?.containsWhitespace&&i.parent&&`value`in r.lastTextNode&&typeof r.lastTextNode.value==`string`&&(!i.parent.depthMap[34]||i.parent.tagId===34)){let e=i.tagHandler?.isInline,t=i.tagHandler?.collapsesInnerWhiteSpace,a=i.tagHandler?.spacing&&Array.isArray(i.tagHandler.spacing);if((!e||n===1)&&!(!e&&!t&&_>0)&&!(t&&n===0)&&!(a&&n===0)){let e=p.length,t=p.trimEnd();e-t.length>0&&o?.length&&o[o.length-1]===p&&(o[o.length-1]=t)}r.lastTextNode=void 0}d[0]?.[0]&&n===0&&c&&j(c,d[0][0],r)&&e(i,` `,r);for(let t of d)e(i,t,r)}function o(e){x(e,{depthMap:r.depthMap,depth:0,plugins:r.options?.plugins||[]},e=>{A(e,r.options?.plugins,r,a)})}function s(){return t(r).trimEnd()}function c(){let e=[];for(let[t,n]of Array.from(r.regionContentBuffers.entries()))r.regionToggles.get(t)&&e.push(...n);let t=e.join(``).trimStart(),n=t.slice(i);return i=t.length,n}return{processEvent:a,processHtml:o,getMarkdown:s,getMarkdownChunk:c,state:r}}function F(e,t={}){let n=P(t);return n.processHtml(e),n.getMarkdown()}const I={htmlToMarkdown:F};typeof window<`u`&&(window.mdream=I);var L=I;
|
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { _ as TagHandler, a as HandlerContext, b as ExtractedElement, c as MdreamRuntimeState, d as Plugin, f as PluginContext, g as TEXT_NODE, h as SplitterOptions, i as HTMLToMarkdownOptions, l as Node, m as ReadabilityContext, n as ELEMENT_NODE, o as MarkdownChunk, p as PluginCreationOptions, r as ElementNode, s as MdreamProcessingState, t as BufferRegion, u as NodeEvent, v as TailwindContext, y as TextNode } from "./_chunks/types-CT4ZxeOH.mjs";
|
|
2
|
+
import { t as createPlugin } from "./_chunks/plugin-D5soyEXm.mjs";
|
|
3
3
|
import { ReadableStream } from "node:stream/web";
|
|
4
4
|
|
|
5
5
|
//#region src/const.d.ts
|
|
@@ -165,9 +165,6 @@ interface ParseResult {
|
|
|
165
165
|
* Completely decoupled from markdown generation
|
|
166
166
|
*/
|
|
167
167
|
declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
|
|
168
|
-
/**
|
|
169
|
-
* Streaming HTML parser - calls onEvent for each DOM event
|
|
170
|
-
*/
|
|
171
168
|
//#endregion
|
|
172
169
|
//#region src/stream.d.ts
|
|
173
170
|
/**
|
package/dist/index.mjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { TagIdMap } from "./_chunks/const-
|
|
2
|
-
import {
|
|
3
|
-
import { createPlugin } from "./_chunks/plugin-
|
|
4
|
-
import {
|
|
1
|
+
import { _n as TagIdMap } from "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import { i as parseHtml, t as MarkdownProcessor } from "./_chunks/markdown-processor-D26Uo5td.mjs";
|
|
3
|
+
import { t as createPlugin } from "./_chunks/plugin-CjWWQTuL.mjs";
|
|
4
|
+
import { n as streamHtmlToMarkdown, t as htmlToMarkdown } from "./_chunks/src-BJpipdul.mjs";
|
|
5
5
|
|
|
6
6
|
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/llms-txt.mjs
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import "./_chunks/const-
|
|
2
|
-
import "./_chunks/markdown-processor-
|
|
3
|
-
import "./_chunks/plugin-
|
|
4
|
-
import "./_chunks/src-
|
|
5
|
-
import "./_chunks/extraction-
|
|
6
|
-
import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-
|
|
1
|
+
import "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import "./_chunks/markdown-processor-D26Uo5td.mjs";
|
|
3
|
+
import "./_chunks/plugin-CjWWQTuL.mjs";
|
|
4
|
+
import "./_chunks/src-BJpipdul.mjs";
|
|
5
|
+
import "./_chunks/extraction-BA9MDtq3.mjs";
|
|
6
|
+
import { t as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-D7Hduhij.mjs";
|
|
7
7
|
|
|
8
8
|
export { generateLlmsTxtArtifacts };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Plugin,
|
|
2
|
-
import {
|
|
1
|
+
import { d as Plugin, x as extractionPlugin } from "./_chunks/types-CT4ZxeOH.mjs";
|
|
2
|
+
import { t as createPlugin } from "./_chunks/plugin-D5soyEXm.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/plugins/filter.d.ts
|
|
5
5
|
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import "./_chunks/const-
|
|
2
|
-
import { createPlugin } from "./_chunks/plugin-
|
|
3
|
-
import { extractionPlugin } from "./_chunks/extraction-
|
|
4
|
-
import { filterPlugin, frontmatterPlugin,
|
|
1
|
+
import "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import { t as createPlugin } from "./_chunks/plugin-CjWWQTuL.mjs";
|
|
3
|
+
import { t as extractionPlugin } from "./_chunks/extraction-BA9MDtq3.mjs";
|
|
4
|
+
import { a as filterPlugin, i as frontmatterPlugin, n as readabilityPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./_chunks/plugins-DJnqR2fA.mjs";
|
|
5
5
|
|
|
6
6
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import "../_chunks/const-
|
|
2
|
-
import "../_chunks/plugin-
|
|
3
|
-
import "../_chunks/extraction-
|
|
4
|
-
import "../_chunks/plugins-
|
|
5
|
-
import { withMinimalPreset } from "../_chunks/minimal-
|
|
1
|
+
import "../_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import "../_chunks/plugin-CjWWQTuL.mjs";
|
|
3
|
+
import "../_chunks/extraction-BA9MDtq3.mjs";
|
|
4
|
+
import "../_chunks/plugins-DJnqR2fA.mjs";
|
|
5
|
+
import { t as withMinimalPreset } from "../_chunks/minimal-BiDhcwif.mjs";
|
|
6
6
|
|
|
7
7
|
export { withMinimalPreset };
|
package/dist/splitter.d.mts
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { h as SplitterOptions, o as MarkdownChunk } from "./_chunks/types-CT4ZxeOH.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/splitter.d.ts
|
|
4
4
|
|
|
5
|
+
/**
|
|
6
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
7
|
+
* Yields chunks during HTML event processing for better memory efficiency
|
|
8
|
+
*/
|
|
9
|
+
declare function htmlToMarkdownSplitChunksStream(html: string, options?: SplitterOptions): Generator<MarkdownChunk, void, undefined>;
|
|
5
10
|
/**
|
|
6
11
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
7
12
|
* Chunks are created during HTML event processing
|
|
8
13
|
*/
|
|
9
14
|
declare function htmlToMarkdownSplitChunks(html: string, options?: SplitterOptions): MarkdownChunk[];
|
|
10
15
|
//#endregion
|
|
11
|
-
export { type MarkdownChunk, type SplitterOptions, htmlToMarkdownSplitChunks };
|
|
16
|
+
export { type MarkdownChunk, type SplitterOptions, htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
package/dist/splitter.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { $ as TAG_H2, F as TAG_CODE, Nt as TAG_PRE, Q as TAG_H1, et as TAG_H3, gn as TEXT_NODE, h as NodeEventExit, m as NodeEventEnter, nt as TAG_H5, ot as TAG_HR, r as ELEMENT_NODE, rt as TAG_H6, tt as TAG_H4 } from "./_chunks/const-Bf_XN9U9.mjs";
|
|
2
|
+
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor-D26Uo5td.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/splitter.ts
|
|
5
5
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
@@ -36,26 +36,22 @@ function shouldSplitOnHeader(tagId, options) {
|
|
|
36
36
|
*/
|
|
37
37
|
function getCurrentMarkdown(state) {
|
|
38
38
|
const fragments = [];
|
|
39
|
-
for (const [regionId, content] of state.regionContentBuffers.entries())
|
|
40
|
-
const include = state.regionToggles.get(regionId);
|
|
41
|
-
if (include) fragments.push(...content);
|
|
42
|
-
}
|
|
39
|
+
for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
43
40
|
return fragments.join("").trimStart();
|
|
44
41
|
}
|
|
45
42
|
/**
|
|
46
43
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
47
|
-
*
|
|
44
|
+
* Yields chunks during HTML event processing for better memory efficiency
|
|
48
45
|
*/
|
|
49
|
-
function
|
|
46
|
+
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
50
47
|
const opts = createOptions(options);
|
|
51
48
|
if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
|
|
52
49
|
const processor = createMarkdownProcessor({
|
|
53
50
|
origin: opts.origin,
|
|
54
51
|
plugins: opts.plugins
|
|
55
52
|
});
|
|
56
|
-
const
|
|
57
|
-
const
|
|
58
|
-
const seenSplitHeaders = new Set();
|
|
53
|
+
const headerHierarchy = /* @__PURE__ */ new Map();
|
|
54
|
+
const seenSplitHeaders = /* @__PURE__ */ new Set();
|
|
59
55
|
let currentChunkCodeLanguage = "";
|
|
60
56
|
let collectingHeaderText = false;
|
|
61
57
|
let currentHeaderTagId = null;
|
|
@@ -63,19 +59,27 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
63
59
|
let lineNumber = 1;
|
|
64
60
|
let lastChunkEndPosition = 0;
|
|
65
61
|
let lastSplitPosition = 0;
|
|
66
|
-
function flushChunk(endPosition, applyOverlap = false) {
|
|
62
|
+
function* flushChunk(endPosition, applyOverlap = false) {
|
|
67
63
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
68
64
|
const chunkEnd = endPosition ?? currentMd.length;
|
|
69
|
-
const
|
|
70
|
-
if (!
|
|
65
|
+
const originalChunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
|
|
66
|
+
if (!originalChunkContent.trim()) {
|
|
71
67
|
lastChunkEndPosition = chunkEnd;
|
|
72
68
|
return;
|
|
73
69
|
}
|
|
70
|
+
let chunkContent = originalChunkContent;
|
|
71
|
+
if (opts.stripHeaders) {
|
|
72
|
+
chunkContent = chunkContent.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
|
|
73
|
+
if (!chunkContent) {
|
|
74
|
+
lastChunkEndPosition = chunkEnd;
|
|
75
|
+
return;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
74
78
|
const chunk = {
|
|
75
79
|
content: chunkContent.trimEnd(),
|
|
76
80
|
metadata: { loc: { lines: {
|
|
77
81
|
from: lineNumber,
|
|
78
|
-
to: lineNumber + (
|
|
82
|
+
to: lineNumber + (originalChunkContent.match(/\n/g) || []).length
|
|
79
83
|
} } }
|
|
80
84
|
};
|
|
81
85
|
if (headerHierarchy.size > 0) {
|
|
@@ -86,22 +90,25 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
86
90
|
}
|
|
87
91
|
}
|
|
88
92
|
if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
|
|
89
|
-
|
|
93
|
+
yield chunk;
|
|
90
94
|
currentChunkCodeLanguage = "";
|
|
91
95
|
lastSplitPosition = chunkEnd;
|
|
92
96
|
if (applyOverlap && opts.chunkOverlap > 0) {
|
|
93
|
-
const maxOverlap = Math.max(0,
|
|
94
|
-
|
|
95
|
-
lastChunkEndPosition = chunkEnd - actualOverlap;
|
|
97
|
+
const maxOverlap = Math.max(0, originalChunkContent.length - 1);
|
|
98
|
+
lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
|
|
96
99
|
} else lastChunkEndPosition = chunkEnd;
|
|
97
|
-
lineNumber += (
|
|
100
|
+
lineNumber += (originalChunkContent.match(/\n/g) || []).length;
|
|
98
101
|
}
|
|
99
102
|
const parseState = {
|
|
100
103
|
depthMap: processor.state.depthMap,
|
|
101
104
|
depth: 0,
|
|
102
105
|
plugins: opts.plugins
|
|
103
106
|
};
|
|
107
|
+
const eventBuffer = [];
|
|
104
108
|
parseHtmlStream(html, parseState, (event) => {
|
|
109
|
+
eventBuffer.push(event);
|
|
110
|
+
});
|
|
111
|
+
for (const event of eventBuffer) {
|
|
105
112
|
const { type: eventType, node } = event;
|
|
106
113
|
if (node.type === ELEMENT_NODE) {
|
|
107
114
|
const element = node;
|
|
@@ -113,7 +120,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
113
120
|
currentHeaderText = "";
|
|
114
121
|
if (shouldSplitOnHeader(tagId, opts)) {
|
|
115
122
|
if (seenSplitHeaders.has(tagId)) {
|
|
116
|
-
flushChunk();
|
|
123
|
+
yield* flushChunk();
|
|
117
124
|
for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
|
|
118
125
|
}
|
|
119
126
|
seenSplitHeaders.add(tagId);
|
|
@@ -130,17 +137,13 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
130
137
|
if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
|
|
131
138
|
}
|
|
132
139
|
}
|
|
133
|
-
if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
|
|
134
|
-
}
|
|
135
|
-
if (collectingHeaderText && node.type === TEXT_NODE) {
|
|
136
|
-
const textNode = node;
|
|
137
|
-
currentHeaderText += textNode.value;
|
|
140
|
+
if (tagId === TAG_HR && eventType === NodeEventEnter) yield* flushChunk();
|
|
138
141
|
}
|
|
142
|
+
if (collectingHeaderText && node.type === TEXT_NODE) currentHeaderText += node.value;
|
|
139
143
|
processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
|
|
140
144
|
if (!opts.returnEachLine) {
|
|
141
145
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
142
|
-
|
|
143
|
-
if (currentChunkSize > opts.chunkSize) {
|
|
146
|
+
if (opts.lengthFunction(currentMd.slice(lastChunkEndPosition)) > opts.chunkSize) {
|
|
144
147
|
const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
|
|
145
148
|
const separators = [
|
|
146
149
|
"\n\n",
|
|
@@ -168,11 +171,20 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
168
171
|
}
|
|
169
172
|
}
|
|
170
173
|
if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
|
|
171
|
-
flushChunk(splitPosition, true);
|
|
174
|
+
yield* flushChunk(splitPosition, true);
|
|
172
175
|
}
|
|
173
176
|
}
|
|
174
|
-
}
|
|
175
|
-
flushChunk();
|
|
177
|
+
}
|
|
178
|
+
yield* flushChunk();
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
182
|
+
* Chunks are created during HTML event processing
|
|
183
|
+
*/
|
|
184
|
+
function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
185
|
+
const opts = createOptions(options);
|
|
186
|
+
const chunks = [];
|
|
187
|
+
for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) chunks.push(chunk);
|
|
176
188
|
if (opts.returnEachLine && chunks.length > 0) {
|
|
177
189
|
const lineChunks = [];
|
|
178
190
|
for (const chunk of chunks) {
|
|
@@ -194,9 +206,8 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
194
206
|
}
|
|
195
207
|
return lineChunks;
|
|
196
208
|
}
|
|
197
|
-
|
|
198
|
-
return chunks.filter((chunk) => chunk.content.length > 0);
|
|
209
|
+
return chunks;
|
|
199
210
|
}
|
|
200
211
|
|
|
201
212
|
//#endregion
|
|
202
|
-
export { htmlToMarkdownSplitChunks };
|
|
213
|
+
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|