@kreuzberg/html-to-markdown-node 3.4.0-rc.9 → 3.5.0-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/index.d.ts +676 -112
  3. package/index.js +110 -566
  4. package/package.json +27 -49
package/index.d.ts CHANGED
@@ -1,242 +1,553 @@
1
1
  /* auto-generated by NAPI-RS */
2
2
  /* eslint-disable */
3
- export declare class JsConversionOptionsBuilder {
4
- stripTags(tags: Array<string>): JsConversionOptionsBuilder
5
- preserveTags(tags: Array<string>): JsConversionOptionsBuilder
6
- keepInlineImagesIn(tags: Array<string>): JsConversionOptionsBuilder
7
- excludeSelectors(selectors: Array<string>): JsConversionOptionsBuilder
8
- preprocessing(preprocessing: JsPreprocessingOptions): JsConversionOptionsBuilder
9
- build(): JsConversionOptions
10
- }
11
-
12
- export interface JsAnnotationKind {
3
+ /**
4
+ * Shareable, thread-safe handle to a user-provided HTML visitor implementation.
5
+ *
6
+ * Pass an instance wrapped in this handle to `ConversionOptions` to
7
+ * customise how the HTML document is traversed and converted to Markdown.
8
+ * The handle may be cloned and shared across threads without additional
9
+ * synchronisation on the caller's side.
10
+ */
11
+ export declare class VisitorHandle {
12
+
13
+ }
14
+ export type JsVisitorHandle = VisitorHandle
15
+
16
+ /**
17
+ * The type of an inline text annotation.
18
+ *
19
+ * Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
20
+ */
21
+ export interface AnnotationKind {
13
22
  annotation_type: string
14
23
  url?: string
15
24
  title?: string
16
25
  }
17
26
 
18
- export declare const enum JsCodeBlockStyle {
27
+ /**
28
+ * Code block fence style in Markdown output.
29
+ *
30
+ * Determines how code blocks (`<pre><code>`) are rendered in Markdown.
31
+ */
32
+ export declare const enum CodeBlockStyle {
33
+ /** Indented code blocks (4 spaces). `CommonMark` standard. */
19
34
  Indented = 'Indented',
35
+ /** Fenced code blocks with backticks (```). Default (GFM). Supports language hints. */
20
36
  Backticks = 'Backticks',
37
+ /** Fenced code blocks with tildes (~~~). Supports language hints. */
21
38
  Tildes = 'Tildes'
22
39
  }
23
40
 
24
- export interface JsConversionOptions {
41
+ /**
42
+ * Main conversion options for HTML to Markdown conversion.
43
+ *
44
+ * Build as a plain object (every field is optional), or call `conversionOptionsDefault()` for the defaults.
45
+ *
46
+ * # Example
47
+ */
48
+ export interface ConversionOptions {
49
+ /** Heading style to use in Markdown output (ATX `#` or Setext underline). */
25
50
  headingStyle?: JsHeadingStyle
51
+ /** How to indent nested list items (spaces or tab). */
26
52
  listIndentType?: JsListIndentType
53
+ /** Number of spaces (or tabs) to use for each level of list indentation. */
27
54
  listIndentWidth?: number
55
+ /** Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`). */
28
56
  bullets?: string
57
+ /** Character used for bold/italic emphasis markers (`*` or `_`). */
29
58
  strongEmSymbol?: string
59
+ /** Escape `*` characters in plain text to avoid unintended bold/italic. */
30
60
  escapeAsterisks?: boolean
61
+ /** Escape `_` characters in plain text to avoid unintended bold/italic. */
31
62
  escapeUnderscores?: boolean
63
+ /** Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text. */
32
64
  escapeMisc?: boolean
65
+ /** Escape ASCII characters that have special meaning in certain Markdown dialects. */
33
66
  escapeAscii?: boolean
67
+ /** Default language annotation for fenced code blocks that have no language hint. */
34
68
  codeLanguage?: string
69
+ /** Automatically convert bare URLs into Markdown autolinks. */
35
70
  autolinks?: boolean
71
+ /** Emit a default title when no `<title>` tag is present. */
36
72
  defaultTitle?: boolean
73
+ /** Render `<br>` elements inside table cells as literal line breaks. */
37
74
  brInTables?: boolean
75
+ /**
76
+ * Emit tables without column padding (compact GFM format).
77
+ *
78
+ * When `true`, column widths are not computed and cells are emitted with
79
+ * no trailing spaces. Separator rows use exactly `---` per column.
80
+ * Produces token-efficient output suitable for RAG / LLM contexts.
81
+ *
82
+ * Default `false` (aligned padding preserved).
83
+ */
84
+ compactTables?: boolean
85
+ /** Style used for `<mark>` / highlighted text (e.g. `==text==`). */
38
86
  highlightStyle?: JsHighlightStyle
87
+ /**
88
+ * Populate `result.metadata` with `<head>` / `<meta>` extraction
89
+ * (title, description, Open Graph, Twitter Card, JSON-LD, …).
90
+ *
91
+ * Default `true`. Disabling skips the metadata pass only — table
92
+ * extraction into `result.tables` runs unconditionally.
93
+ */
39
94
  extractMetadata?: boolean
95
+ /**
96
+ * Controls how whitespace sequences are normalised in the converted output.
97
+ *
98
+ * - [`WhitespaceMode::Normalized`] (default) — collapses consecutive whitespace characters
99
+ * (spaces, tabs, newlines) to a single space, matching browser rendering behaviour.
100
+ * - [`WhitespaceMode::Strict`] — preserves all whitespace exactly as it appears in the
101
+ * source HTML, including runs of spaces and embedded newlines.
102
+ *
103
+ * Choose `Strict` only when the source HTML uses deliberate whitespace (e.g. pre-formatted
104
+ * content outside `<pre>` tags). For most documents `Normalized` produces cleaner output.
105
+ */
40
106
  whitespaceMode?: JsWhitespaceMode
107
+ /** Strip all newlines from the output, producing a single-line result. */
41
108
  stripNewlines?: boolean
109
+ /** Wrap long lines at [`wrap_width`](Self::wrap_width) characters. */
42
110
  wrap?: boolean
111
+ /**
112
+ * Maximum output line width in characters when [`wrap`](Self::wrap) is `true` (default `80`).
113
+ *
114
+ * Lines are broken at word boundaries so that no line exceeds this length. A value of `0`
115
+ * is treated as "no limit" — equivalent to leaving [`wrap`](Self::wrap) disabled. Has no
116
+ * effect when `wrap` is `false`.
117
+ */
43
118
  wrapWidth?: number
119
+ /** Treat the entire document as inline content (no block-level wrappers). */
44
120
  convertAsInline?: boolean
121
+ /** Markdown notation for subscript text (e.g. `"~"`). */
45
122
  subSymbol?: string
123
+ /** Markdown notation for superscript text (e.g. `"^"`). */
46
124
  supSymbol?: string
125
+ /** How to encode hard line breaks (`<br>`) in Markdown. */
47
126
  newlineStyle?: JsNewlineStyle
127
+ /** Style used for code blocks (indented, backtick-fenced, or tilde-fenced). */
48
128
  codeBlockStyle?: JsCodeBlockStyle
129
+ /** HTML tag names whose `<img>` children are kept inline instead of block. */
49
130
  keepInlineImagesIn?: Array<string>
131
+ /**
132
+ * Options for the HTML pre-processing pass applied before conversion begins.
133
+ *
134
+ * Pre-processing runs before the HTML is handed to the converter and can perform operations
135
+ * such as unwrapping redundant wrapper elements, removing tracking pixels, and normalising
136
+ * vendor-specific markup. See [`PreprocessingOptions`] for the full set of knobs.
137
+ *
138
+ * Defaults to [`PreprocessingOptions::default()`], which enables the standard cleaning
139
+ * passes. Set individual fields on [`PreprocessingOptions`] (or construct via
140
+ * [`ConversionOptions::builder`]) to opt in or out of specific passes.
141
+ */
50
142
  preprocessing?: JsPreprocessingOptions
143
+ /** Expected character encoding of the input HTML (default `"utf-8"`). */
51
144
  encoding?: string
145
+ /** Emit debug information during conversion. */
52
146
  debug?: boolean
147
+ /** HTML tag names whose content is stripped from the output entirely. */
53
148
  stripTags?: Array<string>
149
+ /** HTML tag names that are preserved verbatim in the output. */
54
150
  preserveTags?: Array<string>
151
+ /** Skip conversion of `<img>` elements (omit images from output). */
55
152
  skipImages?: boolean
153
+ /** Link rendering style (inline or reference). */
56
154
  linkStyle?: JsLinkStyle
155
+ /** Target output format (Markdown, plain text, etc.). */
57
156
  outputFormat?: JsOutputFormat
157
+ /** Include structured document tree in result. */
58
158
  includeDocumentStructure?: boolean
159
+ /** Extract inline images from data URIs and SVGs. */
59
160
  extractImages?: boolean
161
+ /** Maximum decoded image size in bytes (default 5MB). */
60
162
  maxImageSize?: number
163
+ /** Capture SVG elements as images. */
61
164
  captureSvg?: boolean
165
+ /** Infer image dimensions from data. */
62
166
  inferDimensions?: boolean
167
+ /**
168
+ * Maximum DOM traversal depth. Leaving it unset (`undefined`) means unlimited.
169
+ * When set, subtrees beyond this depth are silently truncated.
170
+ */
63
171
  maxDepth?: number
172
+ /**
173
+ * CSS selectors for elements to exclude entirely (element + all content).
174
+ *
175
+ * Unlike `strip_tags` (which removes the tag wrapper but keeps children),
176
+ * excluded elements and all their descendants are dropped from the output.
177
+ * Supports any CSS selector that `tl` supports: tag names, `.class`,
178
+ * `#id`, `[attribute]`, etc.
179
+ *
180
+ * Invalid selectors are silently skipped at conversion time.
181
+ *
182
+ * Example: `[".cookie-banner", "#ad-container", "[role='complementary']"]`
183
+ */
64
184
  excludeSelectors?: Array<string>
65
- }
66
-
67
- export interface JsConversionOptionsUpdate {
68
- headingStyle?: JsHeadingStyle
69
- listIndentType?: JsListIndentType
70
- listIndentWidth?: number
71
- bullets?: string
72
- strongEmSymbol?: string
73
- escapeAsterisks?: boolean
74
- escapeUnderscores?: boolean
75
- escapeMisc?: boolean
76
- escapeAscii?: boolean
77
- codeLanguage?: string
78
- autolinks?: boolean
79
- defaultTitle?: boolean
80
- brInTables?: boolean
81
- highlightStyle?: JsHighlightStyle
82
- extractMetadata?: boolean
83
- whitespaceMode?: JsWhitespaceMode
84
- stripNewlines?: boolean
85
- wrap?: boolean
86
- wrapWidth?: number
87
- convertAsInline?: boolean
88
- subSymbol?: string
89
- supSymbol?: string
90
- newlineStyle?: JsNewlineStyle
91
- codeBlockStyle?: JsCodeBlockStyle
92
- keepInlineImagesIn?: Array<string>
93
- preprocessing?: JsPreprocessingOptionsUpdate
94
- encoding?: string
95
- debug?: boolean
96
- stripTags?: Array<string>
97
- preserveTags?: Array<string>
98
- skipImages?: boolean
99
- linkStyle?: JsLinkStyle
100
- outputFormat?: JsOutputFormat
101
- includeDocumentStructure?: boolean
102
- extractImages?: boolean
103
- maxImageSize?: number
104
- captureSvg?: boolean
105
- inferDimensions?: boolean
106
- maxDepth?: number
107
- excludeSelectors?: Array<string>
108
- }
109
-
110
- export interface JsConversionResult {
185
+ /**
186
+ * Optional visitor for custom traversal logic.
187
+ *
188
+ * When set, the visitor's callbacks are invoked for matching HTML elements
189
+ * during conversion, allowing custom output, skipping, or HTML preservation.
190
+ * See `HtmlVisitor`.
191
+ */
192
+ visitor?: object
193
+ }
194
+
195
+ export declare function conversionOptionsDefault(): ConversionOptions
196
+
197
+ /**
198
+ * The primary result of HTML conversion and extraction.
199
+ *
200
+ * Contains the converted text output, optional structured document tree,
201
+ * metadata, extracted tables, images, and processing warnings.
202
+ *
203
+ * # Example
204
+ *
205
+ * ```text
206
+ * use html_to_markdown_rs::{convert, ConversionOptions};
207
+ *
208
+ * let result = convert("<h1>Hello</h1><p>World</p>", None)?;
209
+ * assert!(result.content.is_some());
210
+ * assert!(result.warnings.is_empty());
211
+ * ```
212
+ */
213
+ export interface ConversionResult {
214
+ /**
215
+ * Converted text output (markdown, djot, or plain text).
216
+ *
217
+ * `None` when `output_format` is set to `OutputFormat::None`,
218
+ * indicating extraction-only mode.
219
+ */
111
220
  content?: string
112
- document?: JsDocumentStructure
113
- metadata?: JsHtmlMetadata
221
+ /**
222
+ * Structured document tree with semantic elements.
223
+ *
224
+ * Populated when `ConversionOptions::include_document_structure` is `true`. `None`
225
+ * otherwise (the default), which avoids the overhead of building the tree.
226
+ *
227
+ * When present, the tree mirrors the converted document: headings open
228
+ * `Group` sections, paragraphs and list items carry
229
+ * inline `TextAnnotation`s, and tables reference the same
230
+ * `TableGrid` data exposed in [`Self::tables`].
231
+ *
232
+ * Note: this field is independent of the `metadata` feature flag. Document structure
233
+ * collection is always available at runtime; it is gated only by the runtime option, not
234
+ * by a compile-time feature.
235
+ */
236
+ document?: DocumentStructure
237
+ /** Extracted HTML metadata (title, OG, links, images, structured data). */
238
+ metadata?: HtmlMetadata
239
+ /** Extracted tables with structured cell data and markdown representation. */
114
240
  tables?: Array<JsTableData>
241
+ /**
242
+ * Extracted inline images (data URIs and SVGs).
243
+ *
244
+ * Populated when `extract_images` is `true` in options.
245
+ */
115
246
  images?: Array<string>
247
+ /** Non-fatal processing warnings. */
116
248
  warnings?: Array<JsProcessingWarning>
117
249
  }
118
250
 
119
- export interface JsDocumentMetadata {
251
+ export declare function convert(html: string, options?: ConversionOptions | undefined | null, visitor?: object | undefined | null): ConversionResult
252
+
253
+ /**
254
+ * Document-level metadata extracted from `<head>` and top-level elements.
255
+ *
256
+ * Contains all metadata typically used by search engines, social media platforms,
257
+ * and browsers for document indexing and presentation.
258
+ *
259
+ * # Examples
260
+ */
261
+ export interface DocumentMetadata {
262
+ /** Document title from `<title>` tag */
120
263
  title?: string
264
+ /** Document description from `<meta name="description">` tag */
121
265
  description?: string
266
+ /** Document keywords from `<meta name="keywords">` tag, split on commas */
122
267
  keywords?: Array<string>
268
+ /** Document author from `<meta name="author">` tag */
123
269
  author?: string
270
+ /** Canonical URL from `<link rel="canonical">` tag */
124
271
  canonicalUrl?: string
272
+ /** Base URL from `<base href="">` tag for resolving relative URLs */
125
273
  baseHref?: string
274
+ /** Document language from `lang` attribute */
126
275
  language?: string
276
+ /** Document text direction from `dir` attribute */
127
277
  textDirection?: JsTextDirection
278
+ /**
279
+ * Open Graph metadata (og:* properties) for social media
280
+ * Keys like "title", "description", "image", "url", etc.
281
+ */
128
282
  openGraph?: Record<string, string>
283
+ /**
284
+ * Twitter Card metadata (twitter:* properties)
285
+ * Keys like "card", "site", "creator", "title", "description", "image", etc.
286
+ */
129
287
  twitterCard?: Record<string, string>
288
+ /**
289
+ * Additional meta tags not covered by specific fields
290
+ * Keys are meta name/property attributes, values are content
291
+ */
130
292
  metaTags?: Record<string, string>
131
293
  }
132
294
 
133
- export interface JsDocumentNode {
295
+ /** A single node in the document tree. */
296
+ export interface DocumentNode {
297
+ /** Deterministic node identifier. */
134
298
  id: string
299
+ /** The semantic content of this node. */
135
300
  content: JsNodeContent
301
+ /** Index of the parent node (None for root nodes). */
136
302
  parent?: number
303
+ /** Indices of child nodes in reading order. */
137
304
  children: Array<number>
305
+ /** Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text. */
138
306
  annotations: Array<JsTextAnnotation>
307
+ /**
308
+ * Format-specific attributes preserved from the source HTML element.
309
+ *
310
+ * Keys are lowercased attribute names as they appear in the HTML (e.g. `"class"`, `"id"`,
311
+ * `"data-foo"`). Values are the raw attribute strings, copied verbatim from the source —
312
+ * no HTML entity decoding is applied here.
313
+ *
314
+ * The map is `None` when no attributes are present (omitted entirely in serialized output).
315
+ * Not every HTML attribute is preserved: only attributes that carry semantic or structural
316
+ * significance for the node type are collected. For example, heading nodes capture the `"id"`
317
+ * attribute for anchor linking; other element-level attributes may be silently dropped.
318
+ */
139
319
  attributes?: Record<string, string>
140
320
  }
141
321
 
142
- export interface JsDocumentStructure {
322
+ /**
323
+ * A structured document tree representing the semantic content of an HTML document.
324
+ *
325
+ * Uses a flat node array with index-based parent/child references for efficient traversal.
326
+ */
327
+ export interface DocumentStructure {
328
+ /** All nodes in document reading order. */
143
329
  nodes: Array<JsDocumentNode>
330
+ /** The source format (always "html" for this crate). */
144
331
  sourceFormat?: string
145
332
  }
146
333
 
147
- export interface JsGridCell {
334
+ /** A single cell in a table grid. */
335
+ export interface GridCell {
336
+ /** The text content of the cell. */
148
337
  content: string
338
+ /** 0-indexed row position. */
149
339
  row: number
340
+ /** 0-indexed column position. */
150
341
  col: number
342
+ /** Number of rows this cell spans (default 1). */
151
343
  rowSpan: number
344
+ /** Number of columns this cell spans (default 1). */
152
345
  colSpan: number
346
+ /** Whether this is a header cell (`<th>`). */
153
347
  isHeader: boolean
154
348
  }
155
349
 
156
- export interface JsHeaderMetadata {
350
+ /**
351
+ * Header element metadata with hierarchy tracking.
352
+ *
353
+ * Captures heading elements (h1-h6) with their text content, identifiers,
354
+ * and position in the document structure.
355
+ *
356
+ * # Examples
357
+ */
358
+ export interface HeaderMetadata {
359
+ /** Header level: 1 (h1) through 6 (h6) */
157
360
  level: number
361
+ /** Normalized text content of the header */
158
362
  text: string
363
+ /** HTML id attribute if present */
159
364
  id?: string
365
+ /** Document tree depth at the header element */
160
366
  depth: number
367
+ /** Byte offset in original HTML document */
161
368
  htmlOffset: number
162
369
  }
163
370
 
164
- export declare const enum JsHeadingStyle {
371
+ /**
372
+ * Heading style options for Markdown output.
373
+ *
374
+ * Controls how headings (h1-h6) are rendered in the output Markdown.
375
+ */
376
+ export declare const enum HeadingStyle {
377
+ /** Underlined style (=== for h1, --- for h2). */
165
378
  Underlined = 'Underlined',
379
+ /** ATX style (# for h1, ## for h2, etc.). Default. */
166
380
  Atx = 'Atx',
381
+ /** ATX closed style (# title #, with closing hashes). */
167
382
  AtxClosed = 'AtxClosed'
168
383
  }
169
384
 
170
- export declare const enum JsHighlightStyle {
385
+ /**
386
+ * Highlight rendering style for `<mark>` elements.
387
+ *
388
+ * Controls how highlighted text is rendered in Markdown output.
389
+ */
390
+ export declare const enum HighlightStyle {
391
+ /** Double equals syntax (==text==). Default. Pandoc-compatible. */
171
392
  DoubleEqual = 'DoubleEqual',
393
+ /** Preserve as HTML (`<mark>text</mark>`). Original HTML tag. */
172
394
  Html = 'Html',
395
+ /** Render as bold (**text**). Uses strong emphasis. */
173
396
  Bold = 'Bold',
397
+ /** Strip formatting, render as plain text. No markup. */
174
398
  None = 'None'
175
399
  }
176
400
 
177
- export interface JsHtmlMetadata {
178
- document?: JsDocumentMetadata
179
- headers?: Array<JsHeaderMetadata>
180
- links?: Array<JsLinkMetadata>
181
- images?: Array<JsImageMetadata>
182
- structuredData?: Array<JsStructuredData>
183
- }
184
-
185
- export interface JsImageMetadata {
401
+ /**
402
+ * Comprehensive metadata extraction result from HTML document.
403
+ *
404
+ * Contains all extracted metadata types in a single structure,
405
+ * suitable for serialization and transmission across language boundaries.
406
+ *
407
+ * # Examples
408
+ */
409
+ export interface HtmlMetadata {
410
+ /** Document-level metadata (title, description, canonical, etc.) */
411
+ document?: DocumentMetadata
412
+ /** Extracted header elements with hierarchy */
413
+ headers?: Array<HeaderMetadata>
414
+ /** Extracted hyperlinks with type classification */
415
+ links?: Array<LinkMetadata>
416
+ /** Extracted images with source and dimensions */
417
+ images?: Array<ImageMetadata>
418
+ /** Extracted structured data blocks */
419
+ structuredData?: Array<StructuredData>
420
+ }
421
+
422
+ /**
423
+ * Image metadata with source and dimensions.
424
+ *
425
+ * Captures `<img>` elements and inline `<svg>` elements with metadata
426
+ * for image analysis and optimization.
427
+ *
428
+ * # Examples
429
+ */
430
+ export interface ImageMetadata {
431
+ /** Image source (URL, data URI, or SVG content identifier) */
186
432
  src: string
433
+ /** Alternative text from alt attribute (for accessibility) */
187
434
  alt?: string
435
+ /** Title attribute (often shown as tooltip) */
188
436
  title?: string
437
+ /** Image dimensions as (width, height) if available */
189
438
  dimensions?: Array<number>
439
+ /** Image type classification */
190
440
  imageType: JsImageType
441
+ /** Additional HTML attributes */
191
442
  attributes: Record<string, string>
192
443
  }
193
444
 
194
- export declare const enum JsImageType {
445
+ /**
446
+ * Image source classification for proper handling and processing.
447
+ *
448
+ * Determines whether an image is embedded (data URI), inline SVG, external, or relative.
449
+ */
450
+ export declare const enum ImageType {
451
+ /** Data URI embedded image (base64 or other encoding) */
195
452
  DataUri = 'data_uri',
453
+ /** Inline SVG element */
196
454
  InlineSvg = 'inline_svg',
455
+ /** External image URL (http/https) */
197
456
  External = 'external',
457
+ /** Relative image path */
198
458
  Relative = 'relative'
199
459
  }
200
460
 
201
- export interface JsLinkMetadata {
461
+ /**
462
+ * Hyperlink metadata with categorization and attributes.
463
+ *
464
+ * Represents `<a>` elements with parsed href values, text content, and link type classification.
465
+ *
466
+ * # Examples
467
+ */
468
+ export interface LinkMetadata {
469
+ /** The href URL value */
202
470
  href: string
471
+ /** Link text content (normalized, concatenated if mixed with elements) */
203
472
  text: string
473
+ /** Optional title attribute (often shown as tooltip) */
204
474
  title?: string
475
+ /** Link type classification */
205
476
  linkType: JsLinkType
477
+ /** Rel attribute values (e.g., "nofollow", "stylesheet", "canonical") */
206
478
  rel: Array<string>
479
+ /** Additional HTML attributes */
207
480
  attributes: Record<string, string>
208
481
  }
209
482
 
210
- export declare const enum JsLinkStyle {
483
+ /**
484
+ * Link rendering style in Markdown output.
485
+ *
486
+ * Controls whether links and images use inline `[text](url)` syntax or
487
+ * reference-style `[text][1]` syntax with definitions collected at the end.
488
+ */
489
+ export declare const enum LinkStyle {
490
+ /** Inline links: `[text](url)`. Default. */
211
491
  Inline = 'Inline',
492
+ /** Reference-style links: `[text][1]` with `[1]: url` at end of document. */
212
493
  Reference = 'Reference'
213
494
  }
214
495
 
215
- export declare const enum JsLinkType {
496
+ /**
497
+ * Link classification based on href value and document context.
498
+ *
499
+ * Used to categorize links during extraction for filtering and analysis.
500
+ */
501
+ export declare const enum LinkType {
502
+ /** Anchor link within same document (href starts with #) */
216
503
  Anchor = 'anchor',
504
+ /** Internal link within same domain */
217
505
  Internal = 'internal',
506
+ /** External link to different domain */
218
507
  External = 'external',
508
+ /** Email link (mailto:) */
219
509
  Email = 'email',
510
+ /** Phone link (tel:) */
220
511
  Phone = 'phone',
512
+ /** Other protocol or unclassifiable */
221
513
  Other = 'other'
222
514
  }
223
515
 
224
- export declare const enum JsListIndentType {
516
+ /**
517
+ * List indentation character type.
518
+ *
519
+ * Controls whether list items are indented with spaces or tabs.
520
+ */
521
+ export declare const enum ListIndentType {
522
+ /** Use spaces for indentation. Default. Width controlled by `list_indent_width`. */
225
523
  Spaces = 'Spaces',
524
+ /** Use tabs for indentation. */
226
525
  Tabs = 'Tabs'
227
526
  }
228
527
 
229
- export declare const enum JsNewlineStyle {
528
+ /**
529
+ * Line break syntax in Markdown output.
530
+ *
531
+ * Controls how hard line breaks (from `<br>` or line breaks in source) are rendered.
532
+ */
533
+ export declare const enum NewlineStyle {
534
+ /** Two trailing spaces at end of line. Default. Standard Markdown syntax. */
230
535
  Spaces = 'Spaces',
536
+ /** Backslash at end of line. Alternative Markdown syntax. */
231
537
  Backslash = 'Backslash'
232
538
  }
233
539
 
234
- export interface JsNodeContent {
540
+ /**
541
+ * The semantic content type of a document node.
542
+ *
543
+ * Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
544
+ */
545
+ export interface NodeContent {
235
546
  node_type: string
236
547
  level?: number
237
548
  text?: string
238
549
  ordered?: boolean
239
- grid?: JsTableGrid
550
+ grid?: TableGrid
240
551
  description?: string
241
552
  src?: string
242
553
  imageIndex?: number
@@ -251,191 +562,444 @@ export interface JsNodeContent {
251
562
  headingText?: string
252
563
  }
253
564
 
254
- export interface JsNodeContext {
565
+ /**
566
+ * Context information passed to all visitor methods.
567
+ *
568
+ * Provides comprehensive metadata about the current node being visited,
569
+ * including its type, attributes, position in the DOM tree, and parent context.
570
+ */
571
+ export interface NodeContext {
572
+ /** Coarse-grained node type classification */
255
573
  nodeType: JsNodeType
574
+ /** Raw HTML tag name (e.g., "div", "h1", "custom-element") */
256
575
  tagName: string
576
+ /** All HTML attributes as key-value pairs */
257
577
  attributes: Record<string, string>
578
+ /** Depth in the DOM tree (0 = root) */
258
579
  depth: number
580
+ /** Index among siblings (0-based) */
259
581
  indexInParent: number
582
+ /** Parent element's tag name (None if root) */
260
583
  parentTag?: string
584
+ /** Whether this element is treated as inline vs block */
261
585
  isInline: boolean
262
586
  }
263
587
 
264
- export declare const enum JsNodeType {
588
+ /**
589
+ * Node type enumeration covering all HTML element types.
590
+ *
591
+ * This enum categorizes all HTML elements that the converter recognizes,
592
+ * providing a coarse-grained classification for visitor dispatch.
593
+ */
594
+ export declare const enum NodeType {
595
+ /** Text node (most frequent - 100+ per document) */
265
596
  Text = 'Text',
597
+ /** Generic element node */
266
598
  Element = 'Element',
599
+ /** Heading elements (h1-h6) */
267
600
  Heading = 'Heading',
601
+ /** Paragraph element */
268
602
  Paragraph = 'Paragraph',
603
+ /** Generic div container */
269
604
  Div = 'Div',
605
+ /** Blockquote element */
270
606
  Blockquote = 'Blockquote',
607
+ /** Preformatted text block */
271
608
  Pre = 'Pre',
609
+ /** Horizontal rule */
272
610
  Hr = 'Hr',
611
+ /** Ordered or unordered list (ul, ol) */
273
612
  List = 'List',
613
+ /** List item (li) */
274
614
  ListItem = 'ListItem',
615
+ /** Definition list (dl) */
275
616
  DefinitionList = 'DefinitionList',
617
+ /** Definition term (dt) */
276
618
  DefinitionTerm = 'DefinitionTerm',
619
+ /** Definition description (dd) */
277
620
  DefinitionDescription = 'DefinitionDescription',
621
+ /** Table element */
278
622
  Table = 'Table',
623
+ /** Table row (tr) */
279
624
  TableRow = 'TableRow',
625
+ /** Table cell (td, th) */
280
626
  TableCell = 'TableCell',
627
+ /** Table header cell (th) */
281
628
  TableHeader = 'TableHeader',
629
+ /** Table body (tbody) */
282
630
  TableBody = 'TableBody',
631
+ /** Table head (thead) */
283
632
  TableHead = 'TableHead',
633
+ /** Table foot (tfoot) */
284
634
  TableFoot = 'TableFoot',
635
+ /** Anchor link (a) */
285
636
  Link = 'Link',
637
+ /** Image (img) */
286
638
  Image = 'Image',
639
+ /** Strong/bold (strong, b) */
287
640
  Strong = 'Strong',
641
+ /** Emphasis/italic (em, i) */
288
642
  Em = 'Em',
643
+ /** Inline code (code) */
289
644
  Code = 'Code',
645
+ /** Strikethrough (s, del, strike) */
290
646
  Strikethrough = 'Strikethrough',
647
+ /** Underline (u, ins) */
291
648
  Underline = 'Underline',
649
+ /** Subscript (sub) */
292
650
  Subscript = 'Subscript',
651
+ /** Superscript (sup) */
293
652
  Superscript = 'Superscript',
653
+ /** Mark/highlight (mark) */
294
654
  Mark = 'Mark',
655
+ /** Small text (small) */
295
656
  Small = 'Small',
657
+ /** Line break (br) */
296
658
  Br = 'Br',
659
+ /** Span element */
297
660
  Span = 'Span',
661
+ /** Article element */
298
662
  Article = 'Article',
663
+ /** Section element */
299
664
  Section = 'Section',
665
+ /** Navigation element */
300
666
  Nav = 'Nav',
667
+ /** Aside element */
301
668
  Aside = 'Aside',
669
+ /** Header element */
302
670
  Header = 'Header',
671
+ /** Footer element */
303
672
  Footer = 'Footer',
673
+ /** Main element */
304
674
  Main = 'Main',
675
+ /** Figure element */
305
676
  Figure = 'Figure',
677
+ /** Figure caption */
306
678
  Figcaption = 'Figcaption',
679
+ /** Time element */
307
680
  Time = 'Time',
681
+ /** Details element */
308
682
  Details = 'Details',
683
+ /** Summary element */
309
684
  Summary = 'Summary',
685
+ /** Form element */
310
686
  Form = 'Form',
687
+ /** Input element */
311
688
  Input = 'Input',
689
+ /** Select element */
312
690
  Select = 'Select',
691
+ /** Option element */
313
692
  Option = 'Option',
693
+ /** Button element */
314
694
  Button = 'Button',
695
+ /** Textarea element */
315
696
  Textarea = 'Textarea',
697
+ /** Label element */
316
698
  Label = 'Label',
699
+ /** Fieldset element */
317
700
  Fieldset = 'Fieldset',
701
+ /** Legend element */
318
702
  Legend = 'Legend',
703
+ /** Audio element */
319
704
  Audio = 'Audio',
705
+ /** Video element */
320
706
  Video = 'Video',
707
+ /** Picture element */
321
708
  Picture = 'Picture',
709
+ /** Source element */
322
710
  Source = 'Source',
711
+ /** Iframe element */
323
712
  Iframe = 'Iframe',
713
+ /** SVG element */
324
714
  Svg = 'Svg',
715
+ /** Canvas element */
325
716
  Canvas = 'Canvas',
717
+ /** Ruby annotation */
326
718
  Ruby = 'Ruby',
719
+ /** Ruby text */
327
720
  Rt = 'Rt',
721
+ /** Ruby parenthesis */
328
722
  Rp = 'Rp',
723
+ /** Abbreviation */
329
724
  Abbr = 'Abbr',
725
+ /** Keyboard input */
330
726
  Kbd = 'Kbd',
727
+ /** Sample output */
331
728
  Samp = 'Samp',
729
+ /** Variable */
332
730
  Var = 'Var',
731
+ /** Citation */
333
732
  Cite = 'Cite',
733
+ /** Quote */
334
734
  Q = 'Q',
735
+ /** Deleted text */
335
736
  Del = 'Del',
737
+ /** Inserted text */
336
738
  Ins = 'Ins',
739
+ /** Data element */
337
740
  Data = 'Data',
741
+ /** Meter element */
338
742
  Meter = 'Meter',
743
+ /** Progress element */
339
744
  Progress = 'Progress',
745
+ /** Output element */
340
746
  Output = 'Output',
747
+ /** Template element */
341
748
  Template = 'Template',
749
+ /** Slot element */
342
750
  Slot = 'Slot',
751
+ /** HTML root element */
343
752
  Html = 'Html',
753
+ /** Head element */
344
754
  Head = 'Head',
755
+ /** Body element */
345
756
  Body = 'Body',
757
+ /** Title element */
346
758
  Title = 'Title',
759
+ /** Meta element */
347
760
  Meta = 'Meta',
761
+ /** Link element (not anchor) */
348
762
  LinkTag = 'LinkTag',
763
+ /** Style element */
349
764
  Style = 'Style',
765
+ /** Script element */
350
766
  Script = 'Script',
767
+ /** Base element */
351
768
  Base = 'Base',
769
+ /** Custom element (web components) or unknown tag */
352
770
  Custom = 'Custom'
353
771
  }
354
772
 
355
- export declare const enum JsOutputFormat {
773
+ /**
774
+ * Output format for conversion.
775
+ *
776
+ * Specifies the target markup language format for the conversion output.
777
+ */
778
+ export declare const enum OutputFormat {
779
+ /** Standard Markdown (CommonMark compatible). Default. */
356
780
  Markdown = 'Markdown',
781
+ /** Djot lightweight markup language. */
357
782
  Djot = 'Djot',
783
+ /** Plain text output (no markup, visible text only). */
358
784
  Plain = 'Plain'
359
785
  }
360
786
 
361
- export interface JsPreprocessingOptions {
787
+ /** HTML preprocessing options for document cleanup before conversion. */
788
+ export interface PreprocessingOptions {
789
+ /** Enable HTML preprocessing globally */
362
790
  enabled?: boolean
791
+ /** Preprocessing preset level (Minimal, Standard, Aggressive) */
363
792
  preset?: JsPreprocessingPreset
793
+ /** Remove navigation elements (nav, breadcrumbs, menus, sidebars) */
364
794
  removeNavigation?: boolean
795
+ /** Remove form elements (forms, inputs, buttons, etc.) */
365
796
  removeForms?: boolean
366
797
  }
367
798
 
368
- export interface JsPreprocessingOptionsUpdate {
369
- enabled?: boolean
370
- preset?: JsPreprocessingPreset
371
- removeNavigation?: boolean
372
- removeForms?: boolean
373
- }
799
+ export declare function preprocessingOptionsDefault(): PreprocessingOptions
374
800
 
375
- export declare const enum JsPreprocessingPreset {
801
+ /**
802
+ * HTML preprocessing aggressiveness level.
803
+ *
804
+ * Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
805
+ */
806
+ export declare const enum PreprocessingPreset {
807
+ /** Minimal cleanup. Remove only essential noise (scripts, styles). */
376
808
  Minimal = 'Minimal',
809
+ /** Standard cleanup. Default. Removes navigation, forms, and other auxiliary content. */
377
810
  Standard = 'Standard',
811
+ /** Aggressive cleanup. Remove extensive non-content elements and structure. */
378
812
  Aggressive = 'Aggressive'
379
813
  }
380
814
 
381
- export interface JsProcessingWarning {
815
+ /**
816
+ * A non-fatal diagnostic produced during HTML conversion.
817
+ *
818
+ * Warnings indicate that conversion completed but some content may have been handled
819
+ * differently than expected — for example, an image that could not be extracted, a truncated
820
+ * input, or malformed HTML that was repaired with best-effort parsing.
821
+ *
822
+ * Conversion always succeeds (returns `ConversionResult`) even when warnings are
823
+ * present. Callers should inspect `warnings` and decide how to
824
+ * handle them based on their tolerance for partial results:
825
+ *
826
+ * - **Logging pipelines**: emit each warning at `WARN` level and continue.
827
+ * - **Strict pipelines**: treat any warning as a hard error by checking
828
+ * that `result.warnings` is empty before using the output.
829
+ *
830
+ * See `WarningKind` for the full taxonomy of warning categories.
831
+ */
832
+ export interface ProcessingWarning {
833
+ /** Human-readable warning message. */
382
834
  message: string
835
+ /** The category of warning. */
383
836
  kind: JsWarningKind
384
837
  }
385
838
 
386
- export interface JsStructuredData {
839
+ /**
840
+ * Structured data block (JSON-LD, Microdata, or RDFa).
841
+ *
842
+ * Represents machine-readable structured data found in the document.
843
+ * JSON-LD blocks are collected as raw JSON strings for flexibility.
844
+ *
845
+ * # Examples
846
+ */
847
+ export interface StructuredData {
848
+ /** Type of structured data (JSON-LD, Microdata, RDFa) */
387
849
  dataType: JsStructuredDataType
850
+ /** Raw JSON string (for JSON-LD) or serialized representation */
388
851
  rawJson: string
852
+ /** Schema type if detectable (e.g., "Article", "Event", "Product") */
389
853
  schemaType?: string
390
854
  }
391
855
 
392
- export declare const enum JsStructuredDataType {
856
+ /**
857
+ * Structured data format type.
858
+ *
859
+ * Identifies the schema/format used for structured data markup.
860
+ */
861
+ export declare const enum StructuredDataType {
862
+ /** JSON-LD (JSON for Linking Data) script blocks */
393
863
  JsonLd = 'json_ld',
864
+ /** HTML5 Microdata attributes (itemscope, itemtype, itemprop) */
394
865
  Microdata = 'microdata',
395
- RDFa = 'rd_fa'
866
+ /** RDF in Attributes (RDFa) markup */
867
+ RDFa = 'rdfa'
396
868
  }
397
869
 
398
- export interface JsTableData {
399
- grid: JsTableGrid
870
+ /** A top-level extracted table with both structured data and markdown representation. */
871
+ export interface TableData {
872
+ /** The structured table grid. */
873
+ grid: TableGrid
874
+ /** The markdown rendering of this table. */
400
875
  markdown: string
401
876
  }
402
877
 
403
- export interface JsTableGrid {
878
+ /** A structured table grid with cell-level data including spans. */
879
+ export interface TableGrid {
880
+ /** Number of rows. */
404
881
  rows?: number
882
+ /** Number of columns. */
405
883
  cols?: number
884
+ /**
885
+ * All cells in the table as a flat, sparse list.
886
+ *
887
+ * The list is ordered by `(row, col)` but is **not** a dense `rows × cols` matrix: cells
888
+ * that are covered by a spanning cell (via `row_span > 1` or `col_span > 1`) do not appear
889
+ * in the list. Only the top-left "origin" cell of a span is present, with its `row_span`
890
+ * and `col_span` fields set accordingly.
891
+ *
892
+ * To reconstruct the full visual grid, iterate over all cells and mark the rectangular
893
+ * region `[row .. row+row_span, col .. col+col_span]` as occupied by that cell. Any
894
+ * `(row, col)` position that is not the origin of any cell is covered by a span from an
895
+ * earlier cell.
896
+ *
897
+ * The length of this array is `≤ rows * cols`. An empty table (`rows == 0 || cols == 0`)
898
+ * produces an empty array.
899
+ */
406
900
  cells?: Array<JsGridCell>
407
901
  }
408
902
 
409
- export interface JsTextAnnotation {
903
+ /**
904
+ * A styling or semantic annotation that applies to a byte range within a node's text.
905
+ *
906
+ * Unlike `DocumentNode`, which captures block-level structure (headings, paragraphs, etc.),
907
+ * a `TextAnnotation` describes inline-level markup — bold, italic, links, code spans, and
908
+ * similar — that spans a contiguous run of bytes inside `DocumentNode.content`'s text field.
909
+ *
910
+ * Byte offsets (`start`..`end`) are into the UTF-8 encoded text of the parent node. The range is
911
+ * half-open: `start` is inclusive and `end` is exclusive. The offsets count UTF-8 bytes, not UTF-16
912
+ * code units, so slice the encoded bytes (e.g. a `Buffer`) rather than calling `String.prototype.slice`.
913
+ *
914
+ * Multiple annotations on the same node can overlap (e.g. bold-italic text), and they are
915
+ * stored in the order they are encountered during DOM traversal.
916
+ *
917
+ * See `AnnotationKind` for the full list of supported annotation types.
918
+ */
919
+ export interface TextAnnotation {
920
+ /** Start byte offset (inclusive) into the parent node's text. */
410
921
  start: number
922
+ /** End byte offset (exclusive) into the parent node's text. */
411
923
  end: number
924
+ /** The type of annotation. */
412
925
  kind: JsAnnotationKind
413
926
  }
414
927
 
415
- export declare const enum JsTextDirection {
416
- LeftToRight = 'LeftToRight',
417
- RightToLeft = 'RightToLeft',
418
- Auto = 'Auto'
419
- }
420
-
421
- export declare const enum JsVisitResult {
928
+ /**
929
+ * Text directionality of document content.
930
+ *
931
+ * Corresponds to the HTML `dir` attribute and `bdi` element directionality.
932
+ */
933
+ export declare const enum TextDirection {
934
+ /** Left-to-right text flow (default for Latin scripts) */
935
+ LeftToRight = 'ltr',
936
+ /** Right-to-left text flow (Hebrew, Arabic, Urdu, etc.) */
937
+ RightToLeft = 'rtl',
938
+ /** Automatic directionality detection */
939
+ Auto = 'auto'
940
+ }
941
+
942
+ /**
943
+ * Result of a visitor callback.
944
+ *
945
+ * Allows visitors to control the conversion flow by either proceeding
946
+ * with default behavior, providing custom output, skipping elements,
947
+ * preserving HTML, or signaling errors.
948
+ */
949
+ export declare const enum VisitResult {
950
+ /** Continue with default conversion behavior */
422
951
  Continue = 'Continue',
952
+ /**
953
+ * Replace default output with custom markdown
954
+ *
955
+ * The visitor takes full responsibility for the markdown output
956
+ * of this node and its children.
957
+ */
423
958
  Custom = 'Custom',
959
+ /**
960
+ * Skip this element entirely (don't output anything)
961
+ *
962
+ * The element and all its children are ignored in the output.
963
+ */
424
964
  Skip = 'Skip',
965
+ /**
966
+ * Preserve original HTML (don't convert to markdown)
967
+ *
968
+ * The element's raw HTML is included verbatim in the output.
969
+ */
425
970
  PreserveHtml = 'PreserveHtml',
971
+ /**
972
+ * Stop conversion with an error
973
+ *
974
+ * The conversion process halts and returns this error message.
975
+ */
426
976
  Error = 'Error'
427
977
  }
428
978
 
429
- export declare const enum JsWarningKind {
979
+ /** Categories of processing warnings. */
980
+ export declare const enum WarningKind {
981
+ /** An image could not be extracted (e.g. invalid data URI, unsupported format). */
430
982
  ImageExtractionFailed = 'image_extraction_failed',
983
+ /** The input encoding was not recognized; fell back to UTF-8. */
431
984
  EncodingFallback = 'encoding_fallback',
985
+ /** The input was truncated due to size limits. */
432
986
  TruncatedInput = 'truncated_input',
987
+ /** The HTML was malformed but processing continued with best effort. */
433
988
  MalformedHtml = 'malformed_html',
989
+ /** Sanitization was applied to remove potentially unsafe content. */
434
990
  SanitizationApplied = 'sanitization_applied',
991
+ /** DOM traversal was truncated because max_depth was exceeded. */
435
992
  DepthLimitExceeded = 'depth_limit_exceeded'
436
993
  }
437
994
 
438
- export declare const enum JsWhitespaceMode {
995
+ /**
996
+ * Whitespace handling strategy during conversion.
997
+ *
998
+ * Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
999
+ */
1000
+ export declare const enum WhitespaceMode {
1001
+ /** Collapse multiple whitespace characters to single spaces. Default. Matches browser behavior. */
439
1002
  Normalized = 'Normalized',
1003
+ /** Preserve all whitespace exactly as it appears in the HTML. */
440
1004
  Strict = 'Strict'
441
1005
  }