@kreuzberg/html-to-markdown-wasm 3.6.0-rc.1 → 3.6.0-rc.11
This diff shows the changes between publicly available package versions as they appear in the supported public registries to which they were released. It is provided for informational purposes only.
- package/README.md +19 -0
- package/package.json +1 -1
- package/pkg/bundler/README.md +19 -0
- package/pkg/bundler/html_to_markdown_wasm.d.ts +53 -8
- package/pkg/bundler/html_to_markdown_wasm.js +1 -1
- package/pkg/bundler/html_to_markdown_wasm_bg.js +295 -56
- package/pkg/bundler/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/bundler/html_to_markdown_wasm_bg.wasm.d.ts +31 -15
- package/pkg/bundler/package.json +1 -1
- package/pkg/deno/README.md +19 -0
- package/pkg/deno/html_to_markdown_wasm.d.ts +53 -8
- package/pkg/deno/html_to_markdown_wasm.js +295 -56
- package/pkg/deno/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/deno/html_to_markdown_wasm_bg.wasm.d.ts +31 -15
- package/pkg/nodejs/README.md +19 -0
- package/pkg/nodejs/html_to_markdown_wasm.d.ts +53 -8
- package/pkg/nodejs/html_to_markdown_wasm.js +298 -56
- package/pkg/nodejs/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/nodejs/html_to_markdown_wasm_bg.wasm.d.ts +31 -15
- package/pkg/nodejs/package.json +1 -1
- package/pkg/web/README.md +19 -0
- package/pkg/web/html_to_markdown_wasm.d.ts +84 -23
- package/pkg/web/html_to_markdown_wasm.js +295 -56
- package/pkg/web/html_to_markdown_wasm_bg.wasm +0 -0
- package/pkg/web/html_to_markdown_wasm_bg.wasm.d.ts +31 -15
- package/pkg/web/package.json +1 -1
package/README.md
CHANGED
|
@@ -126,6 +126,25 @@ const markdown = result.content;
|
|
|
126
126
|
console.log(markdown);
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
## Architecture
|
|
130
|
+
|
|
131
|
+
The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
|
|
132
|
+
|
|
133
|
+
1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
|
|
134
|
+
2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
|
|
135
|
+
3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
|
|
136
|
+
|
|
137
|
+
The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
|
|
138
|
+
|
|
139
|
+
## Capabilities
|
|
140
|
+
|
|
141
|
+
- **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
|
|
142
|
+
- **CommonMark-compatible Markdown** with GFM-style tables.
|
|
143
|
+
- **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
|
|
144
|
+
- **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
|
|
145
|
+
- **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
|
|
146
|
+
- **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
|
|
147
|
+
|
|
129
148
|
## API Reference
|
|
130
149
|
|
|
131
150
|
### Core Function
|
package/package.json
CHANGED
package/pkg/bundler/README.md
CHANGED
|
@@ -126,6 +126,25 @@ const markdown = result.content;
|
|
|
126
126
|
console.log(markdown);
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
## Architecture
|
|
130
|
+
|
|
131
|
+
The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
|
|
132
|
+
|
|
133
|
+
1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
|
|
134
|
+
2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
|
|
135
|
+
3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
|
|
136
|
+
|
|
137
|
+
The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
|
|
138
|
+
|
|
139
|
+
## Capabilities
|
|
140
|
+
|
|
141
|
+
- **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
|
|
142
|
+
- **CommonMark-compatible Markdown** with GFM-style tables.
|
|
143
|
+
- **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
|
|
144
|
+
- **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
|
|
145
|
+
- **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
|
|
146
|
+
- **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
|
|
147
|
+
|
|
129
148
|
## API Reference
|
|
130
149
|
|
|
131
150
|
### Core Function
|
package/pkg/bundler/html_to_markdown_wasm.d.ts
CHANGED
|
@@ -40,7 +40,7 @@ export class WasmConversionOptions {
|
|
|
40
40
|
free(): void;
|
|
41
41
|
[Symbol.dispose](): void;
|
|
42
42
|
static default(): WasmConversionOptions;
|
|
43
|
-
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, maxDepth?: number | null);
|
|
43
|
+
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null, maxDepth?: number | null);
|
|
44
44
|
autolinks: boolean;
|
|
45
45
|
brInTables: boolean;
|
|
46
46
|
bullets: string;
|
|
@@ -87,6 +87,8 @@ export class WasmConversionOptions {
|
|
|
87
87
|
strongEmSymbol: string;
|
|
88
88
|
subSymbol: string;
|
|
89
89
|
supSymbol: string;
|
|
90
|
+
get tierStrategy(): string;
|
|
91
|
+
set tierStrategy(value: WasmTierStrategy);
|
|
90
92
|
get urlEscapeStyle(): string;
|
|
91
93
|
set urlEscapeStyle(value: WasmUrlEscapeStyle);
|
|
92
94
|
get visitor(): WasmVisitorHandle | undefined;
|
|
@@ -107,7 +109,7 @@ export class WasmConversionOptionsUpdate {
|
|
|
107
109
|
free(): void;
|
|
108
110
|
[Symbol.dispose](): void;
|
|
109
111
|
static default(): WasmConversionOptionsUpdate;
|
|
110
|
-
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null);
|
|
112
|
+
constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null);
|
|
111
113
|
get autolinks(): boolean | undefined;
|
|
112
114
|
set autolinks(value: boolean | null | undefined);
|
|
113
115
|
get brInTables(): boolean | undefined;
|
|
@@ -184,6 +186,8 @@ export class WasmConversionOptionsUpdate {
|
|
|
184
186
|
set subSymbol(value: string | null | undefined);
|
|
185
187
|
get supSymbol(): string | undefined;
|
|
186
188
|
set supSymbol(value: string | null | undefined);
|
|
189
|
+
get tierStrategy(): string | undefined;
|
|
190
|
+
set tierStrategy(value: WasmTierStrategy | null | undefined);
|
|
187
191
|
get urlEscapeStyle(): string | undefined;
|
|
188
192
|
set urlEscapeStyle(value: WasmUrlEscapeStyle | null | undefined);
|
|
189
193
|
get visitor(): WasmVisitorHandle | undefined;
|
|
@@ -221,7 +225,6 @@ export class WasmConversionResult {
|
|
|
221
225
|
set content(value: string | null | undefined);
|
|
222
226
|
get document(): WasmDocumentStructure | undefined;
|
|
223
227
|
set document(value: WasmDocumentStructure | null | undefined);
|
|
224
|
-
images: string[];
|
|
225
228
|
metadata: WasmHtmlMetadata;
|
|
226
229
|
tables: WasmTableData[];
|
|
227
230
|
warnings: WasmProcessingWarning[];
|
|
@@ -383,6 +386,23 @@ export class WasmHtmlMetadata {
|
|
|
383
386
|
structuredData: WasmStructuredData[];
|
|
384
387
|
}
|
|
385
388
|
|
|
389
|
+
/**
|
|
390
|
+
* Image dimensions in pixels.
|
|
391
|
+
*
|
|
392
|
+
* Binding-safe replacement for `(u32, u32)` tuples, which degrade to
|
|
393
|
+
* `Vec<Vec<String>>` when sanitized for cross-language binding generation.
|
|
394
|
+
* Used by both `ImageMetadata` and
|
|
395
|
+
* `InlineImage`.
|
|
396
|
+
*/
|
|
397
|
+
export class WasmImageDimensions {
|
|
398
|
+
free(): void;
|
|
399
|
+
[Symbol.dispose](): void;
|
|
400
|
+
static default(): WasmImageDimensions;
|
|
401
|
+
constructor(width: number, height: number);
|
|
402
|
+
height: number;
|
|
403
|
+
width: number;
|
|
404
|
+
}
|
|
405
|
+
|
|
386
406
|
/**
|
|
387
407
|
* Image metadata with source and dimensions.
|
|
388
408
|
*
|
|
@@ -395,12 +415,12 @@ export class WasmImageMetadata {
|
|
|
395
415
|
free(): void;
|
|
396
416
|
[Symbol.dispose](): void;
|
|
397
417
|
static default(): WasmImageMetadata;
|
|
398
|
-
constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?:
|
|
418
|
+
constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: WasmImageDimensions | null);
|
|
399
419
|
get alt(): string | undefined;
|
|
400
420
|
set alt(value: string | null | undefined);
|
|
401
421
|
attributes: any;
|
|
402
|
-
get dimensions():
|
|
403
|
-
set dimensions(value:
|
|
422
|
+
get dimensions(): WasmImageDimensions | undefined;
|
|
423
|
+
set dimensions(value: WasmImageDimensions | null | undefined);
|
|
404
424
|
get imageType(): string;
|
|
405
425
|
set imageType(value: WasmImageType);
|
|
406
426
|
src: string;
|
|
@@ -477,6 +497,22 @@ export enum WasmListIndentType {
|
|
|
477
497
|
Tabs = 1,
|
|
478
498
|
}
|
|
479
499
|
|
|
500
|
+
/**
|
|
501
|
+
* A single key-value metadata entry from `<head>` meta tags.
|
|
502
|
+
*
|
|
503
|
+
* Binding-safe replacement for `(String, String)` tuples used in
|
|
504
|
+
* `NodeContent.MetadataBlock`. Tuple pairs cannot be represented
|
|
505
|
+
* across language boundaries without lossy degradation.
|
|
506
|
+
*/
|
|
507
|
+
export class WasmMetadataEntry {
|
|
508
|
+
free(): void;
|
|
509
|
+
[Symbol.dispose](): void;
|
|
510
|
+
static default(): WasmMetadataEntry;
|
|
511
|
+
constructor(key: string, value: string);
|
|
512
|
+
key: string;
|
|
513
|
+
value: string;
|
|
514
|
+
}
|
|
515
|
+
|
|
480
516
|
/**
|
|
481
517
|
* Line break syntax in Markdown output.
|
|
482
518
|
*
|
|
@@ -503,8 +539,8 @@ export class WasmNodeContent {
|
|
|
503
539
|
set definition(value: string | null | undefined);
|
|
504
540
|
get description(): string | undefined;
|
|
505
541
|
set description(value: string | null | undefined);
|
|
506
|
-
get entries():
|
|
507
|
-
set entries(value:
|
|
542
|
+
get entries(): WasmMetadataEntry[] | undefined;
|
|
543
|
+
set entries(value: WasmMetadataEntry[] | null | undefined);
|
|
508
544
|
get format(): string | undefined;
|
|
509
545
|
set format(value: string | null | undefined);
|
|
510
546
|
get grid(): WasmTableGrid | undefined;
|
|
@@ -830,6 +866,15 @@ export enum WasmTextDirection {
|
|
|
830
866
|
Auto = 2,
|
|
831
867
|
}
|
|
832
868
|
|
|
869
|
+
/**
|
|
870
|
+
* Controls which conversion tier is used.
|
|
871
|
+
*/
|
|
872
|
+
export enum WasmTierStrategy {
|
|
873
|
+
Auto = 0,
|
|
874
|
+
Tier2 = 1,
|
|
875
|
+
Tier1 = 2,
|
|
876
|
+
}
|
|
877
|
+
|
|
833
878
|
/**
|
|
834
879
|
* URL encoding strategy for link and image destinations.
|
|
835
880
|
*
|
package/pkg/bundler/html_to_markdown_wasm.js
CHANGED
|
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./html_to_markdown_wasm_bg.js";
|
|
|
5
5
|
__wbg_set_wasm(wasm);
|
|
6
6
|
|
|
7
7
|
export {
|
|
8
|
-
WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
|
|
8
|
+
WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageDimensions, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmMetadataEntry, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmTierStrategy, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
|
|
9
9
|
} from "./html_to_markdown_wasm_bg.js";
|