@kreuzberg/html-to-markdown-wasm 3.4.0-rc.8 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,65 +1,35 @@
1
1
  {
2
- "name": "@kreuzberg/html-to-markdown-wasm",
3
- "version": "3.4.0-rc.8",
4
- "description": "High-performance HTML to Markdown converter - WebAssembly bindings",
5
- "main": "dist/html_to_markdown_wasm.js",
6
- "types": "dist/html_to_markdown_wasm.d.ts",
7
- "exports": {
8
- ".": {
9
- "import": "./dist/html_to_markdown_wasm.js",
10
- "types": "./dist/html_to_markdown_wasm.d.ts",
11
- "default": "./dist/html_to_markdown_wasm.js"
12
- },
13
- "./dist-node": {
14
- "import": "./dist-node/html_to_markdown_wasm.js",
15
- "require": "./dist-node/html_to_markdown_wasm.js",
16
- "types": "./dist-node/html_to_markdown_wasm.d.ts"
17
- },
18
- "./dist-node/*": "./dist-node/*",
19
- "./dist-web": {
20
- "import": "./dist-web/html_to_markdown_wasm.js",
21
- "types": "./dist-web/html_to_markdown_wasm.d.ts"
22
- },
23
- "./dist-web/*": "./dist-web/*"
24
- },
25
- "repository": "https://github.com/kreuzberg-dev/html-to-markdown",
26
- "homepage": "https://github.com/kreuzberg-dev/html-to-markdown",
27
- "license": "MIT",
28
- "author": "Na'aman Hirschfeld <naaman@kreuzberg.dev>",
29
- "bugs": "https://github.com/kreuzberg-dev/html-to-markdown/issues",
30
- "keywords": [
31
- "html",
32
- "markdown",
33
- "converter",
34
- "rust",
35
- "wasm",
36
- "webassembly"
37
- ],
38
- "files": [
39
- "dist",
40
- "dist-node",
41
- "dist-web",
42
- "README.md"
43
- ],
44
- "scripts": {
45
- "build": "wasm-pack build --target bundler --out-dir dist && node ./scripts/patch-bundler-entry.js",
46
- "build:nodejs": "wasm-pack build --target nodejs --out-dir dist-node && node ./scripts/patch-bundler-entry.js dist-node --types-only",
47
- "build:web": "wasm-pack build --target web --out-dir dist-web && node ./scripts/patch-bundler-entry.js dist-web --types-only",
48
- "build:all": "pnpm run build && pnpm run build:nodejs && pnpm run build:web && pnpm run cleanup:gitignore",
49
- "cleanup:gitignore": "node ./scripts/cleanup-gitignore.js",
50
- "test": "vitest run",
51
- "test:watch": "vitest",
52
- "test:wasm-pack": "wasm-pack test --headless --chrome",
53
- "clean": "rm -rf dist dist-node dist-web node_modules pkg"
54
- },
55
- "devDependencies": {
56
- "@types/node": "^25.6.0",
57
- "tsx": "^4.21.0",
58
- "vitest": "^4.1.5",
59
- "wasm-pack": "^0.14.0"
60
- },
61
- "publishConfig": {
62
- "registry": "https://registry.npmjs.org/",
63
- "access": "public"
64
- }
2
+ "name": "@kreuzberg/html-to-markdown-wasm",
3
+ "version": "3.4.0",
4
+ "private": false,
5
+ "description": "High-performance HTML to Markdown converter",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "https://github.com/kreuzberg-dev/html-to-markdown",
10
+ "directory": "crates/html-to-markdown-wasm"
11
+ },
12
+ "files": [
13
+ "pkg",
14
+ "*.wasm",
15
+ "*.d.ts",
16
+ "README.md"
17
+ ],
18
+ "type": "module",
19
+ "main": "pkg/nodejs/html-to-markdown_wasm.js",
20
+ "module": "pkg/web/html-to-markdown_wasm.js",
21
+ "types": "pkg/nodejs/html-to-markdown_wasm.d.ts",
22
+ "scripts": {
23
+ "build": "wasm-pack build --target nodejs --out-dir pkg/nodejs",
24
+ "build:ci": "wasm-pack build --release --target nodejs --out-dir pkg/nodejs",
25
+ "build:wasm:web": "wasm-pack build --release --target web --out-dir pkg/web",
26
+ "build:wasm:bundler": "wasm-pack build --release --target bundler --out-dir pkg/bundler",
27
+ "build:wasm:nodejs": "wasm-pack build --release --target nodejs --out-dir pkg/nodejs",
28
+ "build:wasm:deno": "wasm-pack build --release --target deno --out-dir pkg/deno",
29
+ "build:all": "npm run build:wasm:web && npm run build:wasm:bundler && npm run build:wasm:nodejs && npm run build:wasm:deno && find pkg -name .gitignore -delete",
30
+ "test": "vitest run",
31
+ "test:watch": "vitest watch",
32
+ "test:coverage": "vitest run --coverage",
33
+ "clean": "rm -rf pkg dist"
34
+ }
65
35
  }
@@ -63,7 +63,7 @@ export class WasmConversionOptions {
63
63
  * Create from a partial update, applying to defaults.
64
64
  */
65
65
  static fromUpdate(update: WasmConversionOptionsUpdate): WasmConversionOptions;
66
- constructor(heading_style?: WasmHeadingStyle | null, list_indent_type?: WasmListIndentType | null, list_indent_width?: number | null, bullets?: string | null, strong_em_symbol?: string | null, escape_asterisks?: boolean | null, escape_underscores?: boolean | null, escape_misc?: boolean | null, escape_ascii?: boolean | null, code_language?: string | null, autolinks?: boolean | null, default_title?: boolean | null, br_in_tables?: boolean | null, highlight_style?: WasmHighlightStyle | null, extract_metadata?: boolean | null, whitespace_mode?: WasmWhitespaceMode | null, strip_newlines?: boolean | null, wrap?: boolean | null, wrap_width?: number | null, convert_as_inline?: boolean | null, sub_symbol?: string | null, sup_symbol?: string | null, newline_style?: WasmNewlineStyle | null, code_block_style?: WasmCodeBlockStyle | null, keep_inline_images_in?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, strip_tags?: string[] | null, preserve_tags?: string[] | null, skip_images?: boolean | null, link_style?: WasmLinkStyle | null, output_format?: WasmOutputFormat | null, include_document_structure?: boolean | null, extract_images?: boolean | null, max_image_size?: bigint | null, capture_svg?: boolean | null, infer_dimensions?: boolean | null, exclude_selectors?: string[] | null, max_depth?: number | null);
66
+ constructor(heading_style?: WasmHeadingStyle | null, list_indent_type?: WasmListIndentType | null, list_indent_width?: number | null, bullets?: string | null, strong_em_symbol?: string | null, escape_asterisks?: boolean | null, escape_underscores?: boolean | null, escape_misc?: boolean | null, escape_ascii?: boolean | null, code_language?: string | null, autolinks?: boolean | null, default_title?: boolean | null, br_in_tables?: boolean | null, highlight_style?: WasmHighlightStyle | null, extract_metadata?: boolean | null, whitespace_mode?: WasmWhitespaceMode | null, strip_newlines?: boolean | null, wrap?: boolean | null, wrap_width?: number | null, convert_as_inline?: boolean | null, sub_symbol?: string | null, sup_symbol?: string | null, newline_style?: WasmNewlineStyle | null, code_block_style?: WasmCodeBlockStyle | null, keep_inline_images_in?: string[] | null, preprocessing?: WasmPreprocessingOptions | null, encoding?: string | null, debug?: boolean | null, strip_tags?: string[] | null, preserve_tags?: string[] | null, skip_images?: boolean | null, link_style?: WasmLinkStyle | null, output_format?: WasmOutputFormat | null, include_document_structure?: boolean | null, extract_images?: boolean | null, max_image_size?: bigint | null, capture_svg?: boolean | null, infer_dimensions?: boolean | null, exclude_selectors?: string[] | null, max_depth?: number | null, visitor?: WasmVisitorHandle | null);
67
67
  autolinks: boolean;
68
68
  brInTables: boolean;
69
69
  bullets: string;
@@ -102,6 +102,8 @@ export class WasmConversionOptions {
102
102
  strongEmSymbol: string;
103
103
  subSymbol: string;
104
104
  supSymbol: string;
105
+ get visitor(): WasmVisitorHandle | undefined;
106
+ set visitor(value: WasmVisitorHandle | null | undefined);
105
107
  whitespaceMode: WasmWhitespaceMode;
106
108
  wrap: boolean;
107
109
  wrapWidth: number;
@@ -140,6 +142,10 @@ export class WasmConversionOptionsBuilder {
140
142
  * Set the list of HTML tag names whose content is stripped from output.
141
143
  */
142
144
  stripTags(tags: string[]): WasmConversionOptionsBuilder;
145
+ /**
146
+ * Set the visitor used during conversion.
147
+ */
148
+ visitor(visitor?: WasmVisitorHandle | null): WasmConversionOptionsBuilder;
143
149
  }
144
150
 
145
151
  /**
@@ -151,7 +157,7 @@ export class WasmConversionOptionsBuilder {
151
157
  export class WasmConversionOptionsUpdate {
152
158
  free(): void;
153
159
  [Symbol.dispose](): void;
154
- constructor(heading_style?: WasmHeadingStyle | null, list_indent_type?: WasmListIndentType | null, list_indent_width?: number | null, bullets?: string | null, strong_em_symbol?: string | null, escape_asterisks?: boolean | null, escape_underscores?: boolean | null, escape_misc?: boolean | null, escape_ascii?: boolean | null, code_language?: string | null, autolinks?: boolean | null, default_title?: boolean | null, br_in_tables?: boolean | null, highlight_style?: WasmHighlightStyle | null, extract_metadata?: boolean | null, whitespace_mode?: WasmWhitespaceMode | null, strip_newlines?: boolean | null, wrap?: boolean | null, wrap_width?: number | null, convert_as_inline?: boolean | null, sub_symbol?: string | null, sup_symbol?: string | null, newline_style?: WasmNewlineStyle | null, code_block_style?: WasmCodeBlockStyle | null, keep_inline_images_in?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, strip_tags?: string[] | null, preserve_tags?: string[] | null, skip_images?: boolean | null, link_style?: WasmLinkStyle | null, output_format?: WasmOutputFormat | null, include_document_structure?: boolean | null, extract_images?: boolean | null, max_image_size?: bigint | null, capture_svg?: boolean | null, infer_dimensions?: boolean | null, max_depth?: number | null, exclude_selectors?: string[] | null);
160
+ constructor(heading_style?: WasmHeadingStyle | null, list_indent_type?: WasmListIndentType | null, list_indent_width?: number | null, bullets?: string | null, strong_em_symbol?: string | null, escape_asterisks?: boolean | null, escape_underscores?: boolean | null, escape_misc?: boolean | null, escape_ascii?: boolean | null, code_language?: string | null, autolinks?: boolean | null, default_title?: boolean | null, br_in_tables?: boolean | null, highlight_style?: WasmHighlightStyle | null, extract_metadata?: boolean | null, whitespace_mode?: WasmWhitespaceMode | null, strip_newlines?: boolean | null, wrap?: boolean | null, wrap_width?: number | null, convert_as_inline?: boolean | null, sub_symbol?: string | null, sup_symbol?: string | null, newline_style?: WasmNewlineStyle | null, code_block_style?: WasmCodeBlockStyle | null, keep_inline_images_in?: string[] | null, preprocessing?: WasmPreprocessingOptionsUpdate | null, encoding?: string | null, debug?: boolean | null, strip_tags?: string[] | null, preserve_tags?: string[] | null, skip_images?: boolean | null, link_style?: WasmLinkStyle | null, output_format?: WasmOutputFormat | null, include_document_structure?: boolean | null, extract_images?: boolean | null, max_image_size?: bigint | null, capture_svg?: boolean | null, infer_dimensions?: boolean | null, max_depth?: number | null, exclude_selectors?: string[] | null, visitor?: WasmVisitorHandle | null);
155
161
  get autolinks(): boolean | undefined;
156
162
  set autolinks(value: boolean | null | undefined);
157
163
  get brInTables(): boolean | undefined;
@@ -226,6 +232,8 @@ export class WasmConversionOptionsUpdate {
226
232
  set subSymbol(value: string | null | undefined);
227
233
  get supSymbol(): string | undefined;
228
234
  set supSymbol(value: string | null | undefined);
235
+ get visitor(): WasmVisitorHandle | undefined;
236
+ set visitor(value: WasmVisitorHandle | null | undefined);
229
237
  get whitespaceMode(): WasmWhitespaceMode | undefined;
230
238
  set whitespaceMode(value: WasmWhitespaceMode | null | undefined);
231
239
  get wrap(): boolean | undefined;
@@ -273,7 +281,6 @@ export class WasmConversionResult {
273
281
  * # Examples
274
282
  *
275
283
  * ```
276
- * # use html_to_markdown_rs::metadata::DocumentMetadata;
277
284
  * let doc = DocumentMetadata {
278
285
  * title: Some("My Article".to_string()),
279
286
  * description: Some("A great article about Rust".to_string()),
@@ -363,7 +370,6 @@ export class WasmGridCell {
363
370
  * # Examples
364
371
  *
365
372
  * ```
366
- * # use html_to_markdown_rs::metadata::HeaderMetadata;
367
373
  * let header = HeaderMetadata {
368
374
  * level: 1,
369
375
  * text: "Main Title".to_string(),
@@ -389,7 +395,6 @@ export class WasmHeaderMetadata {
389
395
  * # Examples
390
396
  *
391
397
  * ```
392
- * # use html_to_markdown_rs::metadata::HeaderMetadata;
393
398
  * let valid = HeaderMetadata {
394
399
  * level: 3,
395
400
  * text: "Title".to_string(),
@@ -451,7 +456,6 @@ export enum WasmHighlightStyle {
451
456
  * # Examples
452
457
  *
453
458
  * ```
454
- * # use html_to_markdown_rs::metadata::HtmlMetadata;
455
459
  * let metadata = HtmlMetadata {
456
460
  * document: Default::default(),
457
461
  * headers: Vec::new(),
@@ -483,7 +487,6 @@ export class WasmHtmlMetadata {
483
487
  * # Examples
484
488
  *
485
489
  * ```
486
- * # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
487
490
  * let img = ImageMetadata {
488
491
  * src: "https://example.com/image.jpg".to_string(),
489
492
  * alt: Some("An example image".to_string()),
@@ -531,7 +534,6 @@ export enum WasmImageType {
531
534
  * # Examples
532
535
  *
533
536
  * ```
534
- * # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
535
537
  * let link = LinkMetadata {
536
538
  * href: "https://example.com".to_string(),
537
539
  * text: "Example".to_string(),
@@ -562,7 +564,6 @@ export class WasmLinkMetadata {
562
564
  * # Examples
563
565
  *
564
566
  * ```
565
- * # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
566
567
  * assert_eq!(LinkMetadata::classify_link("#section"), LinkType::Anchor);
567
568
  * assert_eq!(LinkMetadata::classify_link("mailto:test@example.com"), LinkType::Email);
568
569
  * assert_eq!(LinkMetadata::classify_link("tel:+1234567890"), LinkType::Phone);
@@ -867,7 +868,6 @@ export class WasmProcessingWarning {
867
868
  * # Examples
868
869
  *
869
870
  * ```
870
- * # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
871
871
  * let schema = StructuredData {
872
872
  * data_type: StructuredDataType::JsonLd,
873
873
  * raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
@@ -961,6 +961,17 @@ export enum WasmVisitResult {
961
961
  Error = 4,
962
962
  }
963
963
 
964
+ /**
965
+ * Type alias for a visitor handle (Rc-wrapped `RefCell` for interior mutability).
966
+ *
967
+ * This allows visitors to be passed around and shared while still being mutable.
968
+ */
969
+ export class WasmVisitorHandle {
970
+ free(): void;
971
+ [Symbol.dispose](): void;
972
+ constructor(visitor: any);
973
+ }
974
+
964
975
  /**
965
976
  * Categories of processing warnings.
966
977
  */
@@ -983,129 +994,27 @@ export enum WasmWhitespaceMode {
983
994
  Strict = 1,
984
995
  }
985
996
 
986
- export function convert(html: string, options?: WasmConversionOptions | null, visitor?: any | null): WasmConversionResult;
987
-
988
-
989
- export type WasmHeadingStyle = "underlined" | "atx" | "atxClosed";
990
- export type WasmListIndentType = "spaces" | "tabs";
991
- export type WasmWhitespaceMode = "normalized" | "strict";
992
- export type WasmNewlineStyle = "spaces" | "backslash";
993
- export type WasmCodeBlockStyle = "indented" | "backticks" | "tildes";
994
- export type WasmHighlightStyle = "doubleEqual" | "html" | "bold" | "none";
995
- export type WasmPreprocessingPreset = "minimal" | "standard" | "aggressive";
996
- export type WasmOutputFormat = "markdown" | "djot" | "plain";
997
-
998
- export interface WasmPreprocessingOptions {
999
- enabled?: boolean;
1000
- preset?: WasmPreprocessingPreset;
1001
- removeNavigation?: boolean;
1002
- removeForms?: boolean;
1003
- }
1004
-
1005
- export interface WasmConversionOptions {
1006
- headingStyle?: WasmHeadingStyle;
1007
- listIndentType?: WasmListIndentType;
1008
- listIndentWidth?: number;
1009
- bullets?: string;
1010
- strongEmSymbol?: string;
1011
- escapeAsterisks?: boolean;
1012
- escapeUnderscores?: boolean;
1013
- escapeMisc?: boolean;
1014
- escapeAscii?: boolean;
1015
- codeLanguage?: string;
1016
- autolinks?: boolean;
1017
- defaultTitle?: boolean;
1018
- brInTables?: boolean;
1019
- hocrSpatialTables?: boolean;
1020
- highlightStyle?: WasmHighlightStyle;
1021
- extractMetadata?: boolean;
1022
- whitespaceMode?: WasmWhitespaceMode;
1023
- stripNewlines?: boolean;
1024
- wrap?: boolean;
1025
- wrapWidth?: number;
1026
- convertAsInline?: boolean;
1027
- subSymbol?: string;
1028
- supSymbol?: string;
1029
- newlineStyle?: WasmNewlineStyle;
1030
- codeBlockStyle?: WasmCodeBlockStyle;
1031
- keepInlineImagesIn?: string[];
1032
- preprocessing?: WasmPreprocessingOptions | null;
1033
- encoding?: string;
1034
- debug?: boolean;
1035
- stripTags?: string[];
1036
- preserveTags?: string[];
1037
- skipImages?: boolean;
1038
- outputFormat?: WasmOutputFormat;
1039
- includeDocumentStructure?: boolean;
1040
- extractImages?: boolean;
1041
- maxImageSize?: number;
1042
- captureSvg?: boolean;
1043
- inferDimensions?: boolean;
1044
- }
1045
-
1046
- /** A single cell in a structured table grid. */
1047
- export interface WasmGridCell {
1048
- content: string;
1049
- row: number;
1050
- col: number;
1051
- rowSpan: number;
1052
- colSpan: number;
1053
- isHeader: boolean;
1054
- }
1055
-
1056
- /** Structured table grid with cell-level data. */
1057
- export interface WasmTableGrid {
1058
- rows: number;
1059
- cols: number;
1060
- cells: WasmGridCell[];
1061
- }
1062
-
1063
- /** A table extracted during conversion. */
1064
- export interface WasmConversionTable {
1065
- grid: WasmTableGrid;
1066
- markdown: string;
1067
- }
1068
-
1069
- /** Non-fatal warning emitted during conversion. */
1070
- export interface WasmConversionWarning {
1071
- /** Human-readable warning message. */
1072
- message: string;
1073
- /** Warning kind identifier. */
1074
- kind: string;
1075
- }
1076
-
1077
- /** An extracted inline image from the HTML document. */
1078
- export interface WasmInlineImage {
1079
- /** Raw image data as a Uint8Array. */
1080
- data: Uint8Array;
1081
- /** Image format (png, jpeg, gif, svg, etc.). */
1082
- format: string;
1083
- /** Generated or provided filename, or null. */
1084
- filename: string | null;
1085
- /** Alt text or description, or null. */
1086
- description: string | null;
1087
- /** Image width in pixels, or null if not available. */
1088
- width: number | null;
1089
- /** Image height in pixels, or null if not available. */
1090
- height: number | null;
1091
- /** Source type ("img_data_uri" or "svg_element"). */
1092
- source: string;
1093
- /** HTML attributes from the source element. */
1094
- attributes: Record<string, string>;
1095
- }
1096
-
1097
- /** Result of the convert() API. */
1098
- export interface WasmConversionResult {
1099
- /** Converted text output (markdown, djot, or plain text), or null. */
1100
- content: string | null;
1101
- /** Structured document tree serialized as a JSON value, or null. */
1102
- document: unknown | null;
1103
- /** Extracted HTML metadata serialized as a JSON value, or null. */
1104
- metadata: unknown | null;
1105
- /** All tables found in the HTML, in document order. */
1106
- tables: WasmConversionTable[];
1107
- /** Extracted inline images (data URIs and SVGs). */
1108
- images: WasmInlineImage[];
1109
- /** Non-fatal processing warnings. */
1110
- warnings: WasmConversionWarning[];
1111
- }
997
+ /**
998
+ * Convert HTML to Markdown, returning a [`ConversionResult`] with content, metadata, images,
999
+ * and warnings.
1000
+ *
1001
+ * # Arguments
1002
+ *
1003
+ * * `html` the HTML string to convert.
1004
+ * * `options` optional conversion options. Defaults to [`ConversionOptions::default`].
1005
+ *
1006
+ * # Example
1007
+ *
1008
+ * ```
1009
+ * use html_to_markdown_rs::convert;
1010
+ *
1011
+ * let html = "<h1>Hello World</h1>";
1012
+ * let result = convert(html, None).unwrap();
1013
+ * assert!(result.content.as_deref().unwrap_or("").contains("Hello World"));
1014
+ * ```
1015
+ *
1016
+ * # Errors
1017
+ *
1018
+ * Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
1019
+ */
1020
+ export function convert(html: string, options?: WasmConversionOptions | null): WasmConversionResult;
@@ -0,0 +1,9 @@
1
+ /* @ts-self-types="./html_to_markdown_wasm.d.ts" */
2
+ import * as wasm from "./html_to_markdown_wasm_bg.wasm";
3
+ import { __wbg_set_wasm } from "./html_to_markdown_wasm_bg.js";
4
+
5
+ __wbg_set_wasm(wasm);
6
+
7
+ export {
8
+ WasmAnnotationKind, WasmCodeBlockStyle, WasmConversionOptions, WasmConversionOptionsBuilder, WasmConversionOptionsUpdate, WasmConversionResult, WasmDocumentMetadata, WasmDocumentNode, WasmDocumentStructure, WasmGridCell, WasmHeaderMetadata, WasmHeadingStyle, WasmHighlightStyle, WasmHtmlMetadata, WasmImageMetadata, WasmImageType, WasmLinkMetadata, WasmLinkStyle, WasmLinkType, WasmListIndentType, WasmNewlineStyle, WasmNodeContent, WasmNodeContext, WasmNodeType, WasmOutputFormat, WasmPreprocessingOptions, WasmPreprocessingOptionsUpdate, WasmPreprocessingPreset, WasmProcessingWarning, WasmStructuredData, WasmStructuredDataType, WasmTableData, WasmTableGrid, WasmTextAnnotation, WasmTextDirection, WasmVisitResult, WasmVisitorHandle, WasmWarningKind, WasmWhitespaceMode, convert
9
+ } from "./html_to_markdown_wasm_bg.js";