@kreuzberg/html-to-markdown-wasm 3.6.0-rc.1 → 3.6.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -126,6 +126,25 @@ const markdown = result.content;
126
126
  console.log(markdown);
127
127
  ```
128
128
 
129
+ ## Architecture
130
+
131
+ The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
132
+
133
+ 1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
134
+ 2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
135
+ 3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
136
+
137
+ The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
138
+
139
+ ## Capabilities
140
+
141
+ - **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
142
+ - **CommonMark-compatible Markdown** with GFM-style tables.
143
+ - **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
144
+ - **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
145
+ - **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
146
+ - **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
147
+
129
148
  ## API Reference
130
149
 
131
150
  ### Core Function
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kreuzberg/html-to-markdown-wasm",
3
- "version": "3.6.0-rc.1",
3
+ "version": "3.6.0-rc.11",
4
4
  "private": false,
5
5
  "description": "High-performance HTML to Markdown converter",
6
6
  "license": "MIT",
@@ -126,6 +126,25 @@ const markdown = result.content;
126
126
  console.log(markdown);
127
127
  ```
128
128
 
129
+ ## Architecture
130
+
131
+ The converter routes each input through one of three tiers based on a fast prescan of the byte stream:
132
+
133
+ 1. **Tier-1 — single-pass byte scanner.** Handles 110+ HTML tags directly. Bails on any construct it cannot prove byte-equivalent to Tier-2.
134
+ 2. **Tier-2 — DOM walker.** Picks up Tier-1 bails and inputs the classifier rejected up front.
135
+ 3. **Tier-3 — standards-conformant parser.** Engaged for malformed HTML requiring full HTML5 repair.
136
+
137
+ The dispatcher is invisible to the caller. Output is byte-identical across tiers — enforced by a 116-snapshot oracle.
138
+
139
+ ## Capabilities
140
+
141
+ - **16 languages, one Rust core.** Rust, Python, Node.js, WASM, Java, Go, C#, PHP, Ruby, Elixir, R, Dart, Kotlin (Android), Swift, Zig, C ABI.
142
+ - **CommonMark-compatible Markdown** with GFM-style tables.
143
+ - **Djot output**: set `output_format = "djot"` (see Djot Output Format section below).
144
+ - **Real-HTML robust**: unclosed tags, CDATA, custom elements, malformed entities, nested tables, mixed encodings handled without losing content.
145
+ - **Metadata extraction**, **visitor API**, **inline images**, **configurable preprocessing presets**.
146
+ - **Per-group regression gates in CI**: every PR runs the bench harness against per-group thresholds.
147
+
129
148
  ## API Reference
130
149
 
131
150
  ### Core Function
@@ -40,7 +40,7 @@ export class WasmConversionOptions {
40
40
  free(): void;
41
41
  [Symbol.dispose](): void;
42
42
  static default(): WasmConversionOptions;
43
- constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, maxDepth?: number | null);
43
+ constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null, maxDepth?: number | null);
44
44
  autolinks: boolean;
45
45
  brInTables: boolean;
46
46
  bullets: string;
@@ -87,6 +87,8 @@ export class WasmConversionOptions {
87
87
  strongEmSymbol: string;
88
88
  subSymbol: string;
89
89
  supSymbol: string;
90
+ get tierStrategy(): string;
91
+ set tierStrategy(value: WasmTierStrategy);
90
92
  get urlEscapeStyle(): string;
91
93
  set urlEscapeStyle(value: WasmUrlEscapeStyle);
92
94
  get visitor(): WasmVisitorHandle | undefined;
@@ -107,7 +109,7 @@ export class WasmConversionOptionsUpdate {
107
109
  free(): void;
108
110
  [Symbol.dispose](): void;
109
111
  static default(): WasmConversionOptionsUpdate;
110
- constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null);
112
+ constructor(headingStyle?: WasmHeadingStyle | null, listIndentType?: WasmListIndentType | null, listIndentWidth?: number | null, bullets?: string | null, strongEmSymbol?: string | null, escapeAsterisks?: boolean | null, escapeUnderscores?: boolean | null, escapeMisc?: boolean | null, escapeAscii?: boolean | null, codeLanguage?: string | null, autolinks?: boolean | null, defaultTitle?: boolean | null, brInTables?: boolean | null, compactTables?: boolean | null, highlightStyle?: WasmHighlightStyle | null, extractMetadata?: boolean | null, whitespaceMode?: WasmWhitespaceMode | null, stripNewlines?: boolean | null, wrap?: boolean | null, wrapWidth?: number | null, convertAsInline?: boolean | null, subSymbol?: string | null, supSymbol?: string | null, newlineStyle?: WasmNewlineStyle | null, codeBlockStyle?: WasmCodeBlockStyle | null, keepInlineImagesIn?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, stripTags?: string[] | null, preserveTags?: string[] | null, skipImages?: boolean | null, urlEscapeStyle?: WasmUrlEscapeStyle | null, linkStyle?: WasmLinkStyle | null, outputFormat?: WasmOutputFormat | null, includeDocumentStructure?: boolean | null, extractImages?: boolean | null, maxImageSize?: bigint | null, captureSvg?: boolean | null, inferDimensions?: boolean | null, maxDepth?: number | null, excludeSelectors?: string[] | null, tierStrategy?: WasmTierStrategy | null);
111
113
  get autolinks(): boolean | undefined;
112
114
  set autolinks(value: boolean | null | undefined);
113
115
  get brInTables(): boolean | undefined;
@@ -184,6 +186,8 @@ export class WasmConversionOptionsUpdate {
184
186
  set subSymbol(value: string | null | undefined);
185
187
  get supSymbol(): string | undefined;
186
188
  set supSymbol(value: string | null | undefined);
189
+ get tierStrategy(): string | undefined;
190
+ set tierStrategy(value: WasmTierStrategy | null | undefined);
187
191
  get urlEscapeStyle(): string | undefined;
188
192
  set urlEscapeStyle(value: WasmUrlEscapeStyle | null | undefined);
189
193
  get visitor(): WasmVisitorHandle | undefined;
@@ -221,7 +225,6 @@ export class WasmConversionResult {
221
225
  set content(value: string | null | undefined);
222
226
  get document(): WasmDocumentStructure | undefined;
223
227
  set document(value: WasmDocumentStructure | null | undefined);
224
- images: string[];
225
228
  metadata: WasmHtmlMetadata;
226
229
  tables: WasmTableData[];
227
230
  warnings: WasmProcessingWarning[];
@@ -383,6 +386,23 @@ export class WasmHtmlMetadata {
383
386
  structuredData: WasmStructuredData[];
384
387
  }
385
388
 
389
+ /**
390
+ * Image dimensions in pixels.
391
+ *
392
+ * Binding-safe replacement for `(u32, u32)` tuples, which degrade to
393
+ * `Vec<Vec<String>>` when sanitized for cross-language binding generation.
394
+ * Used by both `ImageMetadata` and
395
+ * `InlineImage`.
396
+ */
397
+ export class WasmImageDimensions {
398
+ free(): void;
399
+ [Symbol.dispose](): void;
400
+ static default(): WasmImageDimensions;
401
+ constructor(width: number, height: number);
402
+ height: number;
403
+ width: number;
404
+ }
405
+
386
406
  /**
387
407
  * Image metadata with source and dimensions.
388
408
  *
@@ -395,12 +415,12 @@ export class WasmImageMetadata {
395
415
  free(): void;
396
416
  [Symbol.dispose](): void;
397
417
  static default(): WasmImageMetadata;
398
- constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: Uint32Array | null);
418
+ constructor(src: string, imageType: WasmImageType, attributes: any, alt?: string | null, title?: string | null, dimensions?: WasmImageDimensions | null);
399
419
  get alt(): string | undefined;
400
420
  set alt(value: string | null | undefined);
401
421
  attributes: any;
402
- get dimensions(): Uint32Array | undefined;
403
- set dimensions(value: Uint32Array | null | undefined);
422
+ get dimensions(): WasmImageDimensions | undefined;
423
+ set dimensions(value: WasmImageDimensions | null | undefined);
404
424
  get imageType(): string;
405
425
  set imageType(value: WasmImageType);
406
426
  src: string;
@@ -477,6 +497,22 @@ export enum WasmListIndentType {
477
497
  Tabs = 1,
478
498
  }
479
499
 
500
+ /**
501
+ * A single key-value metadata entry from `<head>` meta tags.
502
+ *
503
+ * Binding-safe replacement for `(String, String)` tuples used in
504
+ * `NodeContent.MetadataBlock`. Tuple pairs cannot be represented
505
+ * across language boundaries without lossy degradation.
506
+ */
507
+ export class WasmMetadataEntry {
508
+ free(): void;
509
+ [Symbol.dispose](): void;
510
+ static default(): WasmMetadataEntry;
511
+ constructor(key: string, value: string);
512
+ key: string;
513
+ value: string;
514
+ }
515
+
480
516
  /**
481
517
  * Line break syntax in Markdown output.
482
518
  *
@@ -503,8 +539,8 @@ export class WasmNodeContent {
503
539
  set definition(value: string | null | undefined);
504
540
  get description(): string | undefined;
505
541
  set description(value: string | null | undefined);
506
- get entries(): any | undefined;
507
- set entries(value: any | null | undefined);
542
+ get entries(): WasmMetadataEntry[] | undefined;
543
+ set entries(value: WasmMetadataEntry[] | null | undefined);
508
544
  get format(): string | undefined;
509
545
  set format(value: string | null | undefined);
510
546
  get grid(): WasmTableGrid | undefined;
@@ -830,6 +866,15 @@ export enum WasmTextDirection {
830
866
  Auto = 2,
831
867
  }
832
868
 
869
+ /**
870
+ * Controls which conversion tier is used.
871
+ */
872
+ export enum WasmTierStrategy {
873
+ Auto = 0,
874
+ Tier2 = 1,
875
+ Tier1 = 2,
876
+ }
877
+
833
878
  /**
834
879
  * URL encoding strategy for link and image destinations.
835
880
  *
@@ -5,5 +5,5 @@ import { __wbg_set_wasm } from "./html_to_markdown_wasm_bg.js";
5
5
  __wbg_set_wasm(wasm);
6
6
 
7
7
  export {
8
- WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
8
+ WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageDimensions, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmMetadataEntry, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmTierStrategy, WasmUrlEscapeStyle, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
9
9
  } from "./html_to_markdown_wasm_bg.js";