@kreuzberg/html-to-markdown-wasm 3.5.5 → 3.6.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/package.json +1 -1
- package/pkg/bundler/README.md +19 -0
- package/pkg/bundler/html_to_markdown_wasm.d.ts +82 -9
- package/pkg/bundler/html_to_markdown_wasm.js +1 -1
- package/pkg/bundler/html_to_markdown_wasm_bg.js +373 -57
- package/pkg/bundler/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/bundler/html_to_markdown_wasm_bg.wasm.d.ts +35 -15
- package/pkg/bundler/package.json +1 -1
- package/pkg/deno/README.md +19 -0
- package/pkg/deno/html_to_markdown_wasm.d.ts +82 -9
- package/pkg/deno/html_to_markdown_wasm.js +373 -57
- package/pkg/deno/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/deno/html_to_markdown_wasm_bg.wasm.d.ts +35 -15
- package/pkg/nodejs/README.md +19 -0
- package/pkg/nodejs/html_to_markdown_wasm.d.ts +82 -9
- package/pkg/nodejs/html_to_markdown_wasm.js +377 -57
- package/pkg/nodejs/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/nodejs/html_to_markdown_wasm_bg.wasm.d.ts +35 -15
- package/pkg/nodejs/package.json +1 -1
- package/pkg/web/README.md +19 -0
- package/pkg/web/html_to_markdown_wasm.d.ts +117 -24
- package/pkg/web/html_to_markdown_wasm.js +373 -57
- package/pkg/web/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/web/html_to_markdown_wasm_bg.wasm.d.ts +35 -15
- package/pkg/web/package.json +1 -1
package/README.md
CHANGED
|
@@ -126,6 +126,25 @@ const markdown = result.content;
|
|
|
126
126
|
console.log(markdown);
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
## Architecture
|
|
130
|
+
|
|
131
|
+
The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
|
|
132
|
+
|
|
133
|
+
1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
|
|
134
|
+
2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
|
|
135
|
+
3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
|
|
136
|
+
|
|
137
|
+
The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
|
|
138
|
+
|
|
139
|
+
## Capabilities
|
|
140
|
+
|
|
141
|
+
- **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
|
|
142
|
+
- **CommonMark-compatible Markdown** with GFM-style tables.
|
|
143
|
+
- **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
|
|
144
|
+
- **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
|
|
145
|
+
- **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
|
|
146
|
+
- **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
|
|
147
|
+
|
|
129
148
|
## API Reference
|
|
130
149
|
|
|
131
150
|
### Core Function
|
package/package.json
CHANGED
package/pkg/bundler/README.md
CHANGED
|
@@ -126,6 +126,25 @@ const markdown = result.content;
|
|
|
126
126
|
console.log(markdown);
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
## Architecture
|
|
130
|
+
|
|
131
|
+
The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
|
|
132
|
+
|
|
133
|
+
1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
|
|
134
|
+
2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
|
|
135
|
+
3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
|
|
136
|
+
|
|
137
|
+
The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
|
|
138
|
+
|
|
139
|
+
## Capabilities
|
|
140
|
+
|
|
141
|
+
- **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
|
|
142
|
+
- **CommonMark-compatible Markdown** with GFM-style tables.
|
|
143
|
+
- **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
|
|
144
|
+
- **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
|
|
145
|
+
- **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
|
|
146
|
+
- **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
|
|
147
|
+
|
|
129
148
|
## API Reference
|
|
130
149
|
|
|
131
150
|
### Core Function
|
|
@@ -40,7 +40,7 @@ export class WasmConversionOptions {
|
|
|
40
40
|
free(): void;
|
|
41
41
|
[Symbol.dispose](): void;
|
|
42
42
|
static default(): WasmConversionOptions;
|
|
43
|
-
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, maxDepth?: number | null);
|
|
43
|
+
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null, maxDepth?: number | null);
|
|
44
44
|
autolinks: boolean;
|
|
45
45
|
brInTables: boolean;
|
|
46
46
|
bullets: string;
|
|
@@ -87,6 +87,10 @@ export class WasmConversionOptions {
|
|
|
87
87
|
strongEmSymbol: string;
|
|
88
88
|
subSymbol: string;
|
|
89
89
|
supSymbol: string;
|
|
90
|
+
get tierStrategy(): string;
|
|
91
|
+
set tierStrategy(value: WasmTierStrategy);
|
|
92
|
+
get urlEscapeStyle(): string;
|
|
93
|
+
set urlEscapeStyle(value: WasmUrlEscapeStyle);
|
|
90
94
|
get visitor(): WasmVisitorHandle | undefined;
|
|
91
95
|
set visitor(value: WasmVisitorHandle | null | undefined);
|
|
92
96
|
get whitespaceMode(): string;
|
|
@@ -105,7 +109,7 @@ export class WasmConversionOptionsUpdate {
|
|
|
105
109
|
free(): void;
|
|
106
110
|
[Symbol.dispose](): void;
|
|
107
111
|
static default(): WasmConversionOptionsUpdate;
|
|
108
|
-
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null);
|
|
112
|
+
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null);
|
|
109
113
|
get autolinks(): boolean | undefined;
|
|
110
114
|
set autolinks(value: boolean | null | undefined);
|
|
111
115
|
get brInTables(): boolean | undefined;
|
|
@@ -182,6 +186,10 @@ export class WasmConversionOptionsUpdate {
|
|
|
182
186
|
set subSymbol(value: string | null | undefined);
|
|
183
187
|
get supSymbol(): string | undefined;
|
|
184
188
|
set supSymbol(value: string | null | undefined);
|
|
189
|
+
get tierStrategy(): string | undefined;
|
|
190
|
+
set tierStrategy(value: WasmTierStrategy | null | undefined);
|
|
191
|
+
get urlEscapeStyle(): string | undefined;
|
|
192
|
+
set urlEscapeStyle(value: WasmUrlEscapeStyle | null | undefined);
|
|
185
193
|
get visitor(): WasmVisitorHandle | undefined;
|
|
186
194
|
set visitor(value: WasmVisitorHandle | null | undefined);
|
|
187
195
|
get whitespaceMode(): string | undefined;
|
|
@@ -217,7 +225,6 @@ export class WasmConversionResult {
|
|
|
217
225
|
set content(value: string | null | undefined);
|
|
218
226
|
get document(): WasmDocumentStructure | undefined;
|
|
219
227
|
set document(value: WasmDocumentStructure | null | undefined);
|
|
220
|
-
images: string[];
|
|
221
228
|
metadata: WasmHtmlMetadata;
|
|
222
229
|
tables: WasmTableData[];
|
|
223
230
|
warnings: WasmProcessingWarning[];
|
|
@@ -379,6 +386,23 @@ export class WasmHtmlMetadata {
|
|
|
379
386
|
structuredData: WasmStructuredData[];
|
|
380
387
|
}
|
|
381
388
|
|
|
389
|
+
/**
|
|
390
|
+
* Image dimensions in pixels.
|
|
391
|
+
*
|
|
392
|
+
* Binding-safe replacement for `(u32, u32)` tuples, which degrade to
|
|
393
|
+
* `Vec<Vec<String>>` when sanitized for cross-language binding generation.
|
|
394
|
+
* Used by both `ImageMetadata` and
|
|
395
|
+
* `InlineImage`.
|
|
396
|
+
*/
|
|
397
|
+
export class WasmImageDimensions {
|
|
398
|
+
free(): void;
|
|
399
|
+
[Symbol.dispose](): void;
|
|
400
|
+
static default(): WasmImageDimensions;
|
|
401
|
+
constructor(width: number, height: number);
|
|
402
|
+
height: number;
|
|
403
|
+
width: number;
|
|
404
|
+
}
|
|
405
|
+
|
|
382
406
|
/**
|
|
383
407
|
* Image metadata with source and dimensions.
|
|
384
408
|
*
|
|
@@ -391,12 +415,12 @@ export class WasmImageMetadata {
|
|
|
391
415
|
free(): void;
|
|
392
416
|
[Symbol.dispose](): void;
|
|
393
417
|
static default(): WasmImageMetadata;
|
|
394
|
-
constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?:
|
|
418
|
+
constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: WasmImageDimensions | null);
|
|
395
419
|
get alt(): string | undefined;
|
|
396
420
|
set alt(value: string | null | undefined);
|
|
397
421
|
attributes: any;
|
|
398
|
-
get dimensions():
|
|
399
|
-
set dimensions(value:
|
|
422
|
+
get dimensions(): WasmImageDimensions | undefined;
|
|
423
|
+
set dimensions(value: WasmImageDimensions | null | undefined);
|
|
400
424
|
get imageType(): string;
|
|
401
425
|
set imageType(value: WasmImageType);
|
|
402
426
|
src: string;
|
|
@@ -473,6 +497,22 @@ export enum WasmListIndentType {
|
|
|
473
497
|
Tabs = 1,
|
|
474
498
|
}
|
|
475
499
|
|
|
500
|
+
/**
|
|
501
|
+
* A single key-value metadata entry from `<head>` meta tags.
|
|
502
|
+
*
|
|
503
|
+
* Binding-safe replacement for `(String, String)` tuples used in
|
|
504
|
+
* `NodeContent.MetadataBlock`. Tuple pairs cannot be represented
|
|
505
|
+
* across language boundaries without lossy degradation.
|
|
506
|
+
*/
|
|
507
|
+
export class WasmMetadataEntry {
|
|
508
|
+
free(): void;
|
|
509
|
+
[Symbol.dispose](): void;
|
|
510
|
+
static default(): WasmMetadataEntry;
|
|
511
|
+
constructor(key: string, value: string);
|
|
512
|
+
key: string;
|
|
513
|
+
value: string;
|
|
514
|
+
}
|
|
515
|
+
|
|
476
516
|
/**
|
|
477
517
|
* Line break syntax in Markdown output.
|
|
478
518
|
*
|
|
@@ -499,8 +539,8 @@ export class WasmNodeContent {
|
|
|
499
539
|
set definition(value: string | null | undefined);
|
|
500
540
|
get description(): string | undefined;
|
|
501
541
|
set description(value: string | null | undefined);
|
|
502
|
-
get entries():
|
|
503
|
-
set entries(value:
|
|
542
|
+
get entries(): WasmMetadataEntry[] | undefined;
|
|
543
|
+
set entries(value: WasmMetadataEntry[] | null | undefined);
|
|
504
544
|
get format(): string | undefined;
|
|
505
545
|
set format(value: string | null | undefined);
|
|
506
546
|
get grid(): WasmTableGrid | undefined;
|
|
@@ -826,6 +866,34 @@ export enum WasmTextDirection {
|
|
|
826
866
|
Auto = 2,
|
|
827
867
|
}
|
|
828
868
|
|
|
869
|
+
/**
|
|
870
|
+
* Controls which conversion tier is used.
|
|
871
|
+
*/
|
|
872
|
+
export enum WasmTierStrategy {
|
|
873
|
+
Auto = 0,
|
|
874
|
+
Tier2 = 1,
|
|
875
|
+
Tier1 = 2,
|
|
876
|
+
}
|
|
877
|
+
|
|
878
|
+
/**
|
|
879
|
+
* URL encoding strategy for link and image destinations.
|
|
880
|
+
*
|
|
881
|
+
* Controls how special characters in URL destinations are handled when they
|
|
882
|
+
* require escaping to produce valid Markdown.
|
|
883
|
+
*
|
|
884
|
+
* The `Angle` variant (default) wraps the destination in angle brackets:
|
|
885
|
+
* `[text](<url with spaces>)`. This is the CommonMark-specified escape hatch
|
|
886
|
+
* but breaks when the URL itself contains `>`.
|
|
887
|
+
*
|
|
888
|
+
* The `Percent` variant percent-encodes every character that is not an RFC 3986
|
|
889
|
+
* unreserved character or `/`, producing a destination safe for all Markdown
|
|
890
|
+
* parsers: `[text](url%20with%20spaces)`.
|
|
891
|
+
*/
|
|
892
|
+
export enum WasmUrlEscapeStyle {
|
|
893
|
+
Angle = 0,
|
|
894
|
+
Percent = 1,
|
|
895
|
+
}
|
|
896
|
+
|
|
829
897
|
/**
|
|
830
898
|
* Result of a visitor callback.
|
|
831
899
|
*
|
|
@@ -884,7 +952,12 @@ export enum WasmWhitespaceMode {
|
|
|
884
952
|
* # Arguments
|
|
885
953
|
*
|
|
886
954
|
* * `html` — the HTML string to convert.
|
|
887
|
-
* * `options` —
|
|
955
|
+
* * `options` — conversion options. The parameter bound is
|
|
956
|
+
* `impl Into<Option<ConversionOptions>>`, so any of the following call shapes are accepted:
|
|
957
|
+
* - `convert(html, ConversionOptions.default())` — bare options, using the defaults.
|
|
958
|
+
* - `convert(html, opts)` — bare options.
|
|
959
|
+
* - `convert(html, Some(opts))` — explicit `Option`.
|
|
960
|
+
* - `convert(html, None)` — fall back to `ConversionOptions.default()`.
|
|
888
961
|
*
|
|
889
962
|
* # Example
|
|
890
963
|
*
|
|
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./html_to_markdown_wasm_bg.js";
|
|
|
5
5
|
__wbg_set_wasm(wasm);
|
|
6
6
|
|
|
7
7
|
export {
|
|
8
|
-
WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
|
|
8
|
+
WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageDimensions, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmMetadataEntry, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmTierStrategy, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
|
|
9
9
|
} from "./html_to_markdown_wasm_bg.js";
|