@kreuzberg/html-to-markdown-wasm 3.5.5 → 3.6.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -126,6 +126,25 @@ const markdown = result.content;
126
126
  console.log(markdown);
127
127
  ```
128
128
 
129
+ ## Architecture
130
+
131
+ The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
132
+
133
+ 1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
134
+ 2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
135
+ 3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
136
+
137
+ The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
138
+
139
+ ## Capabilities
140
+
141
+ - **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
142
+ - **CommonMark-compatible Markdown** with GFM-style tables.
143
+ - **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
144
+ - **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
145
+ - **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
146
+ - **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
147
+
129
148
  ## API Reference
130
149
 
131
150
  ### Core Function
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kreuzberg/html-to-markdown-wasm",
3
- "version": "3.5.5",
3
+ "version": "3.6.0-rc.10",
4
4
  "private": false,
5
5
  "description": "High-performance HTML to Markdown converter",
6
6
  "license": "MIT",
@@ -126,6 +126,25 @@ const markdown = result.content;
126
126
  console.log(markdown);
127
127
  ```
128
128
 
129
+ ## Architecture
130
+
131
+ The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
132
+
133
+ 1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
134
+ 2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
135
+ 3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
136
+
137
+ The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
138
+
139
+ ## Capabilities
140
+
141
+ - **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
142
+ - **CommonMark-compatible Markdown** with GFM-style tables.
143
+ - **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
144
+ - **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
145
+ - **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
146
+ - **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
147
+
129
148
  ## API Reference
130
149
 
131
150
  ### Core Function
@@ -40,7 +40,7 @@ export class WasmConversionOptions {
40
40
  free(): void;
41
41
  [Symbol.dispose](): void;
42
42
  static default(): WasmConversionOptions;
43
- constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, maxDepth?: number | null);
43
+ constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null, maxDepth?: number | null);
44
44
  autolinks: boolean;
45
45
  brInTables: boolean;
46
46
  bullets: string;
@@ -87,6 +87,10 @@ export class WasmConversionOptions {
87
87
  strongEmSymbol: string;
88
88
  subSymbol: string;
89
89
  supSymbol: string;
90
+ get tierStrategy(): string;
91
+ set tierStrategy(value: WasmTierStrategy);
92
+ get urlEscapeStyle(): string;
93
+ set urlEscapeStyle(value: WasmUrlEscapeStyle);
90
94
  get visitor(): WasmVisitorHandle | undefined;
91
95
  set visitor(value: WasmVisitorHandle | null | undefined);
92
96
  get whitespaceMode(): string;
@@ -105,7 +109,7 @@ export class WasmConversionOptionsUpdate {
105
109
  free(): void;
106
110
  [Symbol.dispose](): void;
107
111
  static default(): WasmConversionOptionsUpdate;
108
- constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null);
112
+ constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null);
109
113
  get autolinks(): boolean | undefined;
110
114
  set autolinks(value: boolean | null | undefined);
111
115
  get brInTables(): boolean | undefined;
@@ -182,6 +186,10 @@ export class WasmConversionOptionsUpdate {
182
186
  set subSymbol(value: string | null | undefined);
183
187
  get supSymbol(): string | undefined;
184
188
  set supSymbol(value: string | null | undefined);
189
+ get tierStrategy(): string | undefined;
190
+ set tierStrategy(value: WasmTierStrategy | null | undefined);
191
+ get urlEscapeStyle(): string | undefined;
192
+ set urlEscapeStyle(value: WasmUrlEscapeStyle | null | undefined);
185
193
  get visitor(): WasmVisitorHandle | undefined;
186
194
  set visitor(value: WasmVisitorHandle | null | undefined);
187
195
  get whitespaceMode(): string | undefined;
@@ -217,7 +225,6 @@ export class WasmConversionResult {
217
225
  set content(value: string | null | undefined);
218
226
  get document(): WasmDocumentStructure | undefined;
219
227
  set document(value: WasmDocumentStructure | null | undefined);
220
- images: string[];
221
228
  metadata: WasmHtmlMetadata;
222
229
  tables: WasmTableData[];
223
230
  warnings: WasmProcessingWarning[];
@@ -379,6 +386,23 @@ export class WasmHtmlMetadata {
379
386
  structuredData: WasmStructuredData[];
380
387
  }
381
388
 
389
+ /**
390
+ * Image dimensions in pixels.
391
+ *
392
+ * Binding-safe replacement for `(u32, u32)` tuples, which degrade to
393
+ * `Vec<Vec<String>>` when sanitized for cross-language binding generation.
394
+ * Used by both `ImageMetadata` and
395
+ * `InlineImage`.
396
+ */
397
+ export class WasmImageDimensions {
398
+ free(): void;
399
+ [Symbol.dispose](): void;
400
+ static default(): WasmImageDimensions;
401
+ constructor(width: number, height: number);
402
+ height: number;
403
+ width: number;
404
+ }
405
+
382
406
  /**
383
407
  * Image metadata with source and dimensions.
384
408
  *
@@ -391,12 +415,12 @@ export class WasmImageMetadata {
391
415
  free(): void;
392
416
  [Symbol.dispose](): void;
393
417
  static default(): WasmImageMetadata;
394
- constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: Uint32Array | null);
418
+ constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: WasmImageDimensions | null);
395
419
  get alt(): string | undefined;
396
420
  set alt(value: string | null | undefined);
397
421
  attributes: any;
398
- get dimensions(): Uint32Array | undefined;
399
- set dimensions(value: Uint32Array | null | undefined);
422
+ get dimensions(): WasmImageDimensions | undefined;
423
+ set dimensions(value: WasmImageDimensions | null | undefined);
400
424
  get imageType(): string;
401
425
  set imageType(value: WasmImageType);
402
426
  src: string;
@@ -473,6 +497,22 @@ export enum WasmListIndentType {
473
497
  Tabs = 1,
474
498
  }
475
499
 
500
+ /**
501
+ * A single key-value metadata entry from `<head>` meta tags.
502
+ *
503
+ * Binding-safe replacement for `(String, String)` tuples used in
504
+ * `NodeContent.MetadataBlock`. Tuple pairs cannot be represented
505
+ * across language boundaries without lossy degradation.
506
+ */
507
+ export class WasmMetadataEntry {
508
+ free(): void;
509
+ [Symbol.dispose](): void;
510
+ static default(): WasmMetadataEntry;
511
+ constructor(key: string, value: string);
512
+ key: string;
513
+ value: string;
514
+ }
515
+
476
516
  /**
477
517
  * Line break syntax in Markdown output.
478
518
  *
@@ -499,8 +539,8 @@ export class WasmNodeContent {
499
539
  set definition(value: string | null | undefined);
500
540
  get description(): string | undefined;
501
541
  set description(value: string | null | undefined);
502
- get entries(): any | undefined;
503
- set entries(value: any | null | undefined);
542
+ get entries(): WasmMetadataEntry[] | undefined;
543
+ set entries(value: WasmMetadataEntry[] | null | undefined);
504
544
  get format(): string | undefined;
505
545
  set format(value: string | null | undefined);
506
546
  get grid(): WasmTableGrid | undefined;
@@ -826,6 +866,34 @@ export enum WasmTextDirection {
826
866
  Auto = 2,
827
867
  }
828
868
 
869
+ /**
870
+ * Controls which conversion tier is used.
871
+ */
872
+ export enum WasmTierStrategy {
873
+ Auto = 0,
874
+ Tier2 = 1,
875
+ Tier1 = 2,
876
+ }
877
+
878
+ /**
879
+ * URL encoding strategy for link and image destinations.
880
+ *
881
+ * Controls how special characters in URL destinations are handled when they
882
+ * require escaping to produce valid Markdown.
883
+ *
884
+ * The `Angle` variant (default) wraps the destination in angle brackets:
885
+ * `[text](<url with spaces>)`. This is the CommonMark-specified escape hatch
886
+ * but breaks when the URL itself contains `>`.
887
+ *
888
+ * The `Percent` variant percent-encodes every character that is not an RFC 3986
889
+ * unreserved character or `/`, producing a destination safe for all Markdown
890
+ * parsers: `[text](url%20with%20spaces)`.
891
+ */
892
+ export enum WasmUrlEscapeStyle {
893
+ Angle = 0,
894
+ Percent = 1,
895
+ }
896
+
829
897
  /**
830
898
  * Result of a visitor callback.
831
899
  *
@@ -884,7 +952,12 @@ export enum WasmWhitespaceMode {
884
952
  * # Arguments
885
953
  *
886
954
  * * `html` — the HTML string to convert.
887
- * * `options` — optional conversion options. Defaults to `ConversionOptions.default`.
955
+ * * `options` — conversion options. The parameter bound is
956
+ * `impl Into<Option<ConversionOptions>>`, so any of the following call shapes are accepted:
957
+ * - `convert(html, ConversionOptions.default())` — bare default options.
958
+ * - `convert(html, opts)` — bare caller-supplied options.
959
+ * - `convert(html, Some(opts))` — explicit `Option`.
960
+ * - `convert(html, None)` — fall back to `ConversionOptions.default()`.
888
961
  *
889
962
  * # Example
890
963
  *
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./html_to_markdown_wasm_bg.js";
5
5
  __wbg_set_wasm(wasm);
6
6
 
7
7
  export {
8
- WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
8
+ WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageDimensions, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmMetadataEntry, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmTierStrategy, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
9
9
  } from "./html_to_markdown_wasm_bg.js";