@kreuzberg/html-to-markdown-node 3.4.0 → 3.5.0-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/index.d.ts +667 -111
  3. package/index.js +109 -755
  4. package/package.json +10 -11
package/index.d.ts CHANGED
@@ -1,250 +1,553 @@
1
1
  /* auto-generated by NAPI-RS */
2
2
  /* eslint-disable */
3
- export declare class JsConversionOptionsBuilder {
4
- stripTags(tags: Array<string>): JsConversionOptionsBuilder
5
- preserveTags(tags: Array<string>): JsConversionOptionsBuilder
6
- keepInlineImagesIn(tags: Array<string>): JsConversionOptionsBuilder
7
- excludeSelectors(selectors: Array<string>): JsConversionOptionsBuilder
8
- preprocessing(preprocessing: JsPreprocessingOptions): JsConversionOptionsBuilder
9
- build(): JsConversionOptions
10
- }
11
-
12
- export declare class JsVisitorHandle {
13
-
14
- }
15
-
16
- export declare function convert(html: string, options?: JsConversionOptions | undefined | null): JsConversionResult
17
-
18
- export interface JsAnnotationKind {
3
+ /**
4
+ * Shareable, thread-safe handle to a user-provided HTML visitor implementation.
5
+ *
6
+ * Pass an instance wrapped in this handle to `ConversionOptions` to
7
+ * customise how the HTML document is traversed and converted to Markdown.
8
+ * The handle may be cloned and shared across threads without additional
9
+ * synchronisation on the caller's side.
10
+ */
11
+ export declare class VisitorHandle {
12
+
13
+ }
14
+ export type JsVisitorHandle = VisitorHandle
15
+
16
+ /**
17
+ * The type of an inline text annotation.
18
+ *
19
+ * Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
20
+ */
21
+ export interface AnnotationKind {
19
22
  annotation_type: string
20
23
  url?: string
21
24
  title?: string
22
25
  }
23
26
 
24
- export declare const enum JsCodeBlockStyle {
27
+ /**
28
+ * Code block fence style in Markdown output.
29
+ *
30
+ * Determines how code blocks (`<pre><code>`) are rendered in Markdown.
31
+ */
32
+ export declare const enum CodeBlockStyle {
33
+ /** Indented code blocks (4 spaces). `CommonMark` standard. */
25
34
  Indented = 'Indented',
35
+ /** Fenced code blocks with backticks (```). Default (GFM). Supports language hints. */
26
36
  Backticks = 'Backticks',
37
+ /** Fenced code blocks with tildes (~~~). Supports language hints. */
27
38
  Tildes = 'Tildes'
28
39
  }
29
40
 
30
- export interface JsConversionOptions {
41
+ /**
42
+ * Main conversion options for HTML to Markdown conversion.
43
+ *
44
+ * Construct as a plain object (every field is optional), or call `conversionOptionsDefault()` for defaults.
45
+ *
46
+ * # Example
47
+ */
48
+ export interface ConversionOptions {
49
+ /** Heading style to use in Markdown output (ATX `#` or Setext underline). */
31
50
  headingStyle?: JsHeadingStyle
51
+ /** How to indent nested list items (spaces or tab). */
32
52
  listIndentType?: JsListIndentType
53
+ /** Number of spaces (or tabs) to use for each level of list indentation. */
33
54
  listIndentWidth?: number
55
+ /** Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`). */
34
56
  bullets?: string
57
+ /** Character used for bold/italic emphasis markers (`*` or `_`). */
35
58
  strongEmSymbol?: string
59
+ /** Escape `*` characters in plain text to avoid unintended bold/italic. */
36
60
  escapeAsterisks?: boolean
61
+ /** Escape `_` characters in plain text to avoid unintended bold/italic. */
37
62
  escapeUnderscores?: boolean
63
+ /** Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text. */
38
64
  escapeMisc?: boolean
65
+ /** Escape ASCII characters that have special meaning in certain Markdown dialects. */
39
66
  escapeAscii?: boolean
67
+ /** Default language annotation for fenced code blocks that have no language hint. */
40
68
  codeLanguage?: string
69
+ /** Automatically convert bare URLs into Markdown autolinks. */
41
70
  autolinks?: boolean
71
+ /** Emit a default title when no `<title>` tag is present. */
42
72
  defaultTitle?: boolean
73
+ /** Render `<br>` elements inside table cells as literal line breaks. */
43
74
  brInTables?: boolean
75
+ /**
76
+ * Emit tables without column padding (compact GFM format).
77
+ *
78
+ * When `true`, column widths are not computed and cells are emitted with
79
+ * no trailing spaces. Separator rows use exactly `---` per column.
80
+ * Produces token-efficient output suitable for RAG / LLM contexts.
81
+ *
82
+ * Default `false` (aligned padding preserved).
83
+ */
84
+ compactTables?: boolean
85
+ /** Style used for `<mark>` / highlighted text (e.g. `==text==`). */
44
86
  highlightStyle?: JsHighlightStyle
87
+ /**
88
+ * Populate `result.metadata` with `<head>` / `<meta>` extraction
89
+ * (title, description, Open Graph, Twitter Card, JSON-LD, …).
90
+ *
91
+ * Default `true`. Disabling skips the metadata pass only — table
92
+ * extraction into `result.tables` runs unconditionally.
93
+ */
45
94
  extractMetadata?: boolean
95
+ /**
96
+ * Controls how whitespace sequences are normalised in the converted output.
97
+ *
98
+ * - [`WhitespaceMode::Normalized`] (default) — collapses consecutive whitespace characters
99
+ * (spaces, tabs, newlines) to a single space, matching browser rendering behaviour.
100
+ * - [`WhitespaceMode::Strict`] — preserves all whitespace exactly as it appears in the
101
+ * source HTML, including runs of spaces and embedded newlines.
102
+ *
103
+ * Choose `Strict` only when the source HTML uses deliberate whitespace (e.g. pre-formatted
104
+ * content outside `<pre>` tags). For most documents `Normalized` produces cleaner output.
105
+ */
46
106
  whitespaceMode?: JsWhitespaceMode
107
+ /** Strip all newlines from the output, producing a single-line result. */
47
108
  stripNewlines?: boolean
109
+ /** Wrap long lines at [`wrap_width`](Self::wrap_width) characters. */
48
110
  wrap?: boolean
111
+ /**
112
+ * Maximum output line width in characters when [`wrap`](Self::wrap) is `true` (default `80`).
113
+ *
114
+ * Lines are broken at word boundaries so that no line exceeds this length. A value of `0`
115
+ * is treated as "no limit" — equivalent to leaving [`wrap`](Self::wrap) disabled. Has no
116
+ * effect when `wrap` is `false`.
117
+ */
49
118
  wrapWidth?: number
119
+ /** Treat the entire document as inline content (no block-level wrappers). */
50
120
  convertAsInline?: boolean
121
+ /** Markdown notation for subscript text (e.g. `"~"`). */
51
122
  subSymbol?: string
123
+ /** Markdown notation for superscript text (e.g. `"^"`). */
52
124
  supSymbol?: string
125
+ /** How to encode hard line breaks (`<br>`) in Markdown. */
53
126
  newlineStyle?: JsNewlineStyle
127
+ /** Style used for code blocks (indented, backtick-fenced, or tilde-fenced). */
54
128
  codeBlockStyle?: JsCodeBlockStyle
129
+ /** HTML tag names whose `<img>` children are kept inline instead of block. */
55
130
  keepInlineImagesIn?: Array<string>
131
+ /**
132
+ * Options for the HTML pre-processing pass applied before conversion begins.
133
+ *
134
+ * Pre-processing runs before the HTML is handed to the converter and can perform operations
135
+ * such as unwrapping redundant wrapper elements, removing tracking pixels, and normalising
136
+ * vendor-specific markup. See [`PreprocessingOptions`] for the full set of knobs.
137
+ *
138
+ * Defaults to [`PreprocessingOptions::default()`], which enables the standard cleaning
139
+ * passes. Set individual fields on [`PreprocessingOptions`] (or construct via
140
+ * [`ConversionOptions::builder`]) to opt in or out of specific passes.
141
+ */
56
142
  preprocessing?: JsPreprocessingOptions
143
+ /** Expected character encoding of the input HTML (default `"utf-8"`). */
57
144
  encoding?: string
145
+ /** Emit debug information during conversion. */
58
146
  debug?: boolean
147
+ /** HTML tag names whose tag wrapper is removed while their children are kept; use `excludeSelectors` to drop the content as well. */
59
148
  stripTags?: Array<string>
149
+ /** HTML tag names that are preserved verbatim in the output. */
60
150
  preserveTags?: Array<string>
151
+ /** Skip conversion of `<img>` elements (omit images from output). */
61
152
  skipImages?: boolean
153
+ /** Link rendering style (inline or reference). */
62
154
  linkStyle?: JsLinkStyle
155
+ /** Target output format (Markdown, plain text, etc.). */
63
156
  outputFormat?: JsOutputFormat
157
+ /** Include structured document tree in result. */
64
158
  includeDocumentStructure?: boolean
159
+ /** Extract inline images from data URIs and SVGs. */
65
160
  extractImages?: boolean
161
+ /** Maximum decoded image size in bytes (default 5MB). */
66
162
  maxImageSize?: number
163
+ /** Capture SVG elements as images. */
67
164
  captureSvg?: boolean
165
+ /** Infer image dimensions from data. */
68
166
  inferDimensions?: boolean
167
+ /**
168
+ * Maximum DOM traversal depth. Omitting the field (`undefined`) means unlimited.
169
+ * When set, subtrees beyond this depth are silently truncated.
170
+ */
69
171
  maxDepth?: number
172
+ /**
173
+ * CSS selectors for elements to exclude entirely (element + all content).
174
+ *
175
+ * Unlike `stripTags` (which removes the tag wrapper but keeps children),
176
+ * excluded elements and all their descendants are dropped from the output.
177
+ * Supports any CSS selector that `tl` supports: tag names, `.class`,
178
+ * `#id`, `[attribute]`, etc.
179
+ *
180
+ * Invalid selectors are silently skipped at conversion time.
181
+ *
182
+ * Example: `[".cookie-banner", "#ad-container", "[role='complementary']"]`
183
+ */
70
184
  excludeSelectors?: Array<string>
185
+ /**
186
+ * Optional visitor for custom traversal logic.
187
+ *
188
+ * When set, the visitor's callbacks are invoked for matching HTML elements
189
+ * during conversion, allowing custom output, skipping, or HTML preservation.
190
+ * See `HtmlVisitor`.
191
+ */
71
192
  visitor?: object
72
193
  }
73
194
 
74
- export interface JsConversionOptionsUpdate {
75
- headingStyle?: JsHeadingStyle
76
- listIndentType?: JsListIndentType
77
- listIndentWidth?: number
78
- bullets?: string
79
- strongEmSymbol?: string
80
- escapeAsterisks?: boolean
81
- escapeUnderscores?: boolean
82
- escapeMisc?: boolean
83
- escapeAscii?: boolean
84
- codeLanguage?: string
85
- autolinks?: boolean
86
- defaultTitle?: boolean
87
- brInTables?: boolean
88
- highlightStyle?: JsHighlightStyle
89
- extractMetadata?: boolean
90
- whitespaceMode?: JsWhitespaceMode
91
- stripNewlines?: boolean
92
- wrap?: boolean
93
- wrapWidth?: number
94
- convertAsInline?: boolean
95
- subSymbol?: string
96
- supSymbol?: string
97
- newlineStyle?: JsNewlineStyle
98
- codeBlockStyle?: JsCodeBlockStyle
99
- keepInlineImagesIn?: Array<string>
100
- preprocessing?: JsPreprocessingOptionsUpdate
101
- encoding?: string
102
- debug?: boolean
103
- stripTags?: Array<string>
104
- preserveTags?: Array<string>
105
- skipImages?: boolean
106
- linkStyle?: JsLinkStyle
107
- outputFormat?: JsOutputFormat
108
- includeDocumentStructure?: boolean
109
- extractImages?: boolean
110
- maxImageSize?: number
111
- captureSvg?: boolean
112
- inferDimensions?: boolean
113
- maxDepth?: number
114
- excludeSelectors?: Array<string>
115
- visitor?: object
116
- }
117
-
118
- export interface JsConversionResult {
195
+ export declare function conversionOptionsDefault(): ConversionOptions
196
+
197
+ /**
198
+ * The primary result of HTML conversion and extraction.
199
+ *
200
+ * Contains the converted text output, optional structured document tree,
201
+ * metadata, extracted tables, images, and processing warnings.
202
+ *
203
+ * # Example
204
+ *
205
+ * ```text
206
+ * use html_to_markdown_rs::{convert, ConversionOptions};
207
+ *
208
+ * let result = convert("<h1>Hello</h1><p>World</p>", None)?;
209
+ * assert!(result.content.is_some());
210
+ * assert!(result.warnings.is_empty());
211
+ * ```
212
+ */
213
+ export interface ConversionResult {
214
+ /**
215
+ * Converted text output (markdown, djot, or plain text).
216
+ *
217
+ * `None` when `output_format` is set to `OutputFormat::None`,
218
+ * indicating extraction-only mode.
219
+ */
119
220
  content?: string
120
- document?: JsDocumentStructure
121
- metadata?: JsHtmlMetadata
221
+ /**
222
+ * Structured document tree with semantic elements.
223
+ *
224
+ * Populated when `ConversionOptions::include_document_structure` is `true`. `None`
225
+ * otherwise (the default), which avoids the overhead of building the tree.
226
+ *
227
+ * When present, the tree mirrors the converted document: headings open
228
+ * `Group` sections, paragraphs and list items carry
229
+ * inline `TextAnnotation`s, and tables reference the same
230
+ * `TableGrid` data exposed in [`Self::tables`].
231
+ *
232
+ * Note: this field is independent of the `metadata` feature flag. Document structure
233
+ * collection is always available at runtime; it is gated only by the runtime option, not
234
+ * by a compile-time feature.
235
+ */
236
+ document?: DocumentStructure
237
+ /** Extracted HTML metadata (title, OG, links, images, structured data). */
238
+ metadata?: HtmlMetadata
239
+ /** Extracted tables with structured cell data and markdown representation. */
122
240
  tables?: Array<JsTableData>
241
+ /**
242
+ * Extracted inline images (data URIs and SVGs).
243
+ *
244
+ * Populated when `extract_images` is `true` in options.
245
+ */
123
246
  images?: Array<string>
247
+ /** Non-fatal processing warnings. */
124
248
  warnings?: Array<JsProcessingWarning>
125
249
  }
126
250
 
127
- export interface JsDocumentMetadata {
251
+ export declare function convert(html: string, options?: ConversionOptions | undefined | null, visitor?: object | undefined | null): ConversionResult
252
+
253
+ /**
254
+ * Document-level metadata extracted from `<head>` and top-level elements.
255
+ *
256
+ * Contains all metadata typically used by search engines, social media platforms,
257
+ * and browsers for document indexing and presentation.
258
+ *
259
+ * # Examples
260
+ */
261
+ export interface DocumentMetadata {
262
+ /** Document title from `<title>` tag */
128
263
  title?: string
264
+ /** Document description from `<meta name="description">` tag */
129
265
  description?: string
266
+ /** Document keywords from `<meta name="keywords">` tag, split on commas */
130
267
  keywords?: Array<string>
268
+ /** Document author from `<meta name="author">` tag */
131
269
  author?: string
270
+ /** Canonical URL from `<link rel="canonical">` tag */
132
271
  canonicalUrl?: string
272
+ /** Base URL from `<base href="">` tag for resolving relative URLs */
133
273
  baseHref?: string
274
+ /** Document language from `lang` attribute */
134
275
  language?: string
276
+ /** Document text direction from `dir` attribute */
135
277
  textDirection?: JsTextDirection
278
+ /**
279
+ * Open Graph metadata (og:* properties) for social media
280
+ * Keys like "title", "description", "image", "url", etc.
281
+ */
136
282
  openGraph?: Record<string, string>
283
+ /**
284
+ * Twitter Card metadata (twitter:* properties)
285
+ * Keys like "card", "site", "creator", "title", "description", "image", etc.
286
+ */
137
287
  twitterCard?: Record<string, string>
288
+ /**
289
+ * Additional meta tags not covered by specific fields
290
+ * Keys are meta name/property attributes, values are content
291
+ */
138
292
  metaTags?: Record<string, string>
139
293
  }
140
294
 
141
- export interface JsDocumentNode {
295
+ /** A single node in the document tree. */
296
+ export interface DocumentNode {
297
+ /** Deterministic node identifier. */
142
298
  id: string
299
+ /** The semantic content of this node. */
143
300
  content: JsNodeContent
301
+ /** Index of the parent node (None for root nodes). */
144
302
  parent?: number
303
+ /** Indices of child nodes in reading order. */
145
304
  children: Array<number>
305
+ /** Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text. */
146
306
  annotations: Array<JsTextAnnotation>
307
+ /**
308
+ * Format-specific attributes preserved from the source HTML element.
309
+ *
310
+ * Keys are lowercased attribute names as they appear in the HTML (e.g. `"class"`, `"id"`,
311
+ * `"data-foo"`). Values are the raw attribute strings, copied verbatim from the source —
312
+ * no HTML entity decoding is applied here.
313
+ *
314
+ * The map is `None` when no attributes are present (omitted entirely in serialized output).
315
+ * Not every HTML attribute is preserved: only attributes that carry semantic or structural
316
+ * significance for the node type are collected. For example, heading nodes capture the `"id"`
317
+ * attribute for anchor linking; other element-level attributes may be silently dropped.
318
+ */
147
319
  attributes?: Record<string, string>
148
320
  }
149
321
 
150
- export interface JsDocumentStructure {
322
+ /**
323
+ * A structured document tree representing the semantic content of an HTML document.
324
+ *
325
+ * Uses a flat node array with index-based parent/child references for efficient traversal.
326
+ */
327
+ export interface DocumentStructure {
328
+ /** All nodes in document reading order. */
151
329
  nodes: Array<JsDocumentNode>
330
+ /** The source format (always "html" for this crate). */
152
331
  sourceFormat?: string
153
332
  }
154
333
 
155
- export interface JsGridCell {
334
+ /** A single cell in a table grid. */
335
+ export interface GridCell {
336
+ /** The text content of the cell. */
156
337
  content: string
338
+ /** 0-indexed row position. */
157
339
  row: number
340
+ /** 0-indexed column position. */
158
341
  col: number
342
+ /** Number of rows this cell spans (default 1). */
159
343
  rowSpan: number
344
+ /** Number of columns this cell spans (default 1). */
160
345
  colSpan: number
346
+ /** Whether this is a header cell (`<th>`). */
161
347
  isHeader: boolean
162
348
  }
163
349
 
164
- export interface JsHeaderMetadata {
350
+ /**
351
+ * Header element metadata with hierarchy tracking.
352
+ *
353
+ * Captures heading elements (h1-h6) with their text content, identifiers,
354
+ * and position in the document structure.
355
+ *
356
+ * # Examples
357
+ */
358
+ export interface HeaderMetadata {
359
+ /** Header level: 1 (h1) through 6 (h6) */
165
360
  level: number
361
+ /** Normalized text content of the header */
166
362
  text: string
363
+ /** HTML id attribute if present */
167
364
  id?: string
365
+ /** Document tree depth at the header element */
168
366
  depth: number
367
+ /** Byte offset in original HTML document */
169
368
  htmlOffset: number
170
369
  }
171
370
 
172
- export declare const enum JsHeadingStyle {
371
+ /**
372
+ * Heading style options for Markdown output.
373
+ *
374
+ * Controls how headings (h1-h6) are rendered in the output Markdown.
375
+ */
376
+ export declare const enum HeadingStyle {
377
+ /** Underlined style (=== for h1, --- for h2). */
173
378
  Underlined = 'Underlined',
379
+ /** ATX style (# for h1, ## for h2, etc.). Default. */
174
380
  Atx = 'Atx',
381
+ /** ATX closed style (# title #, with closing hashes). */
175
382
  AtxClosed = 'AtxClosed'
176
383
  }
177
384
 
178
- export declare const enum JsHighlightStyle {
385
+ /**
386
+ * Highlight rendering style for `<mark>` elements.
387
+ *
388
+ * Controls how highlighted text is rendered in Markdown output.
389
+ */
390
+ export declare const enum HighlightStyle {
391
+ /** Double equals syntax (==text==). Default. Pandoc-compatible. */
179
392
  DoubleEqual = 'DoubleEqual',
393
+ /** Preserve as HTML (`<mark>text</mark>`). Original HTML tag. */
180
394
  Html = 'Html',
395
+ /** Render as bold (**text**). Uses strong emphasis. */
181
396
  Bold = 'Bold',
397
+ /** Strip formatting, render as plain text. No markup. */
182
398
  None = 'None'
183
399
  }
184
400
 
185
- export interface JsHtmlMetadata {
186
- document?: JsDocumentMetadata
187
- headers?: Array<JsHeaderMetadata>
188
- links?: Array<JsLinkMetadata>
189
- images?: Array<JsImageMetadata>
190
- structuredData?: Array<JsStructuredData>
191
- }
192
-
193
- export interface JsImageMetadata {
401
+ /**
402
+ * Comprehensive metadata extraction result from HTML document.
403
+ *
404
+ * Contains all extracted metadata types in a single structure,
405
+ * suitable for serialization and transmission across language boundaries.
406
+ *
407
+ * # Examples
408
+ */
409
+ export interface HtmlMetadata {
410
+ /** Document-level metadata (title, description, canonical, etc.) */
411
+ document?: DocumentMetadata
412
+ /** Extracted header elements with hierarchy */
413
+ headers?: Array<HeaderMetadata>
414
+ /** Extracted hyperlinks with type classification */
415
+ links?: Array<LinkMetadata>
416
+ /** Extracted images with source and dimensions */
417
+ images?: Array<ImageMetadata>
418
+ /** Extracted structured data blocks */
419
+ structuredData?: Array<StructuredData>
420
+ }
421
+
422
+ /**
423
+ * Image metadata with source and dimensions.
424
+ *
425
+ * Captures `<img>` elements and inline `<svg>` elements with metadata
426
+ * for image analysis and optimization.
427
+ *
428
+ * # Examples
429
+ */
430
+ export interface ImageMetadata {
431
+ /** Image source (URL, data URI, or SVG content identifier) */
194
432
  src: string
433
+ /** Alternative text from alt attribute (for accessibility) */
195
434
  alt?: string
435
+ /** Title attribute (often shown as tooltip) */
196
436
  title?: string
437
+ /** Image dimensions as (width, height) if available */
197
438
  dimensions?: Array<number>
439
+ /** Image type classification */
198
440
  imageType: JsImageType
441
+ /** Additional HTML attributes */
199
442
  attributes: Record<string, string>
200
443
  }
201
444
 
202
- export declare const enum JsImageType {
445
+ /**
446
+ * Image source classification for proper handling and processing.
447
+ *
448
+ * Determines whether an image is embedded (data URI), inline SVG, external, or relative.
449
+ */
450
+ export declare const enum ImageType {
451
+ /** Data URI embedded image (base64 or other encoding) */
203
452
  DataUri = 'data_uri',
453
+ /** Inline SVG element */
204
454
  InlineSvg = 'inline_svg',
455
+ /** External image URL (http/https) */
205
456
  External = 'external',
457
+ /** Relative image path */
206
458
  Relative = 'relative'
207
459
  }
208
460
 
209
- export interface JsLinkMetadata {
461
+ /**
462
+ * Hyperlink metadata with categorization and attributes.
463
+ *
464
+ * Represents `<a>` elements with parsed href values, text content, and link type classification.
465
+ *
466
+ * # Examples
467
+ */
468
+ export interface LinkMetadata {
469
+ /** The href URL value */
210
470
  href: string
471
+ /** Link text content (normalized, concatenated if mixed with elements) */
211
472
  text: string
473
+ /** Optional title attribute (often shown as tooltip) */
212
474
  title?: string
475
+ /** Link type classification */
213
476
  linkType: JsLinkType
477
+ /** Rel attribute values (e.g., "nofollow", "stylesheet", "canonical") */
214
478
  rel: Array<string>
479
+ /** Additional HTML attributes */
215
480
  attributes: Record<string, string>
216
481
  }
217
482
 
218
- export declare const enum JsLinkStyle {
483
+ /**
484
+ * Link rendering style in Markdown output.
485
+ *
486
+ * Controls whether links and images use inline `[text](url)` syntax or
487
+ * reference-style `[text][1]` syntax with definitions collected at the end.
488
+ */
489
+ export declare const enum LinkStyle {
490
+ /** Inline links: `[text](url)`. Default. */
219
491
  Inline = 'Inline',
492
+ /** Reference-style links: `[text][1]` with `[1]: url` at end of document. */
220
493
  Reference = 'Reference'
221
494
  }
222
495
 
223
- export declare const enum JsLinkType {
496
+ /**
497
+ * Link classification based on href value and document context.
498
+ *
499
+ * Used to categorize links during extraction for filtering and analysis.
500
+ */
501
+ export declare const enum LinkType {
502
+ /** Anchor link within same document (href starts with #) */
224
503
  Anchor = 'anchor',
504
+ /** Internal link within same domain */
225
505
  Internal = 'internal',
506
+ /** External link to different domain */
226
507
  External = 'external',
508
+ /** Email link (mailto:) */
227
509
  Email = 'email',
510
+ /** Phone link (tel:) */
228
511
  Phone = 'phone',
512
+ /** Other protocol or unclassifiable */
229
513
  Other = 'other'
230
514
  }
231
515
 
232
- export declare const enum JsListIndentType {
516
+ /**
517
+ * List indentation character type.
518
+ *
519
+ * Controls whether list items are indented with spaces or tabs.
520
+ */
521
+ export declare const enum ListIndentType {
522
+ /** Use spaces for indentation. Default. Width controlled by `list_indent_width`. */
233
523
  Spaces = 'Spaces',
524
+ /** Use tabs for indentation. */
234
525
  Tabs = 'Tabs'
235
526
  }
236
527
 
237
- export declare const enum JsNewlineStyle {
528
+ /**
529
+ * Line break syntax in Markdown output.
530
+ *
531
+ * Controls how hard line breaks (from `<br>` or line breaks in source) are rendered.
532
+ */
533
+ export declare const enum NewlineStyle {
534
+ /** Two trailing spaces at end of line. Default. Standard Markdown syntax. */
238
535
  Spaces = 'Spaces',
536
+ /** Backslash at end of line. Alternative Markdown syntax. */
239
537
  Backslash = 'Backslash'
240
538
  }
241
539
 
242
- export interface JsNodeContent {
540
+ /**
541
+ * The semantic content type of a document node.
542
+ *
543
+ * Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
544
+ */
545
+ export interface NodeContent {
243
546
  node_type: string
244
547
  level?: number
245
548
  text?: string
246
549
  ordered?: boolean
247
- grid?: JsTableGrid
550
+ grid?: TableGrid
248
551
  description?: string
249
552
  src?: string
250
553
  imageIndex?: number
@@ -259,191 +562,444 @@ export interface JsNodeContent {
259
562
  headingText?: string
260
563
  }
261
564
 
262
- export interface JsNodeContext {
565
+ /**
566
+ * Context information passed to all visitor methods.
567
+ *
568
+ * Provides comprehensive metadata about the current node being visited,
569
+ * including its type, attributes, position in the DOM tree, and parent context.
570
+ */
571
+ export interface NodeContext {
572
+ /** Coarse-grained node type classification */
263
573
  nodeType: JsNodeType
574
+ /** Raw HTML tag name (e.g., "div", "h1", "custom-element") */
264
575
  tagName: string
576
+ /** All HTML attributes as key-value pairs */
265
577
  attributes: Record<string, string>
578
+ /** Depth in the DOM tree (0 = root) */
266
579
  depth: number
580
+ /** Index among siblings (0-based) */
267
581
  indexInParent: number
582
+ /** Parent element's tag name (None if root) */
268
583
  parentTag?: string
584
+ /** Whether this element is treated as inline vs block */
269
585
  isInline: boolean
270
586
  }
271
587
 
272
- export declare const enum JsNodeType {
588
+ /**
589
+ * Node type enumeration covering all HTML element types.
590
+ *
591
+ * This enum categorizes all HTML elements that the converter recognizes,
592
+ * providing a coarse-grained classification for visitor dispatch.
593
+ */
594
+ export declare const enum NodeType {
595
+ /** Text node (most frequent - 100+ per document) */
273
596
  Text = 'Text',
597
+ /** Generic element node */
274
598
  Element = 'Element',
599
+ /** Heading elements (h1-h6) */
275
600
  Heading = 'Heading',
601
+ /** Paragraph element */
276
602
  Paragraph = 'Paragraph',
603
+ /** Generic div container */
277
604
  Div = 'Div',
605
+ /** Blockquote element */
278
606
  Blockquote = 'Blockquote',
607
+ /** Preformatted text block */
279
608
  Pre = 'Pre',
609
+ /** Horizontal rule */
280
610
  Hr = 'Hr',
611
+ /** Ordered or unordered list (ul, ol) */
281
612
  List = 'List',
613
+ /** List item (li) */
282
614
  ListItem = 'ListItem',
615
+ /** Definition list (dl) */
283
616
  DefinitionList = 'DefinitionList',
617
+ /** Definition term (dt) */
284
618
  DefinitionTerm = 'DefinitionTerm',
619
+ /** Definition description (dd) */
285
620
  DefinitionDescription = 'DefinitionDescription',
621
+ /** Table element */
286
622
  Table = 'Table',
623
+ /** Table row (tr) */
287
624
  TableRow = 'TableRow',
625
+ /** Table cell (td, th) */
288
626
  TableCell = 'TableCell',
627
+ /** Table header cell (th) */
289
628
  TableHeader = 'TableHeader',
629
+ /** Table body (tbody) */
290
630
  TableBody = 'TableBody',
631
+ /** Table head (thead) */
291
632
  TableHead = 'TableHead',
633
+ /** Table foot (tfoot) */
292
634
  TableFoot = 'TableFoot',
635
+ /** Anchor link (a) */
293
636
  Link = 'Link',
637
+ /** Image (img) */
294
638
  Image = 'Image',
639
+ /** Strong/bold (strong, b) */
295
640
  Strong = 'Strong',
641
+ /** Emphasis/italic (em, i) */
296
642
  Em = 'Em',
643
+ /** Inline code (code) */
297
644
  Code = 'Code',
645
+ /** Strikethrough (s, del, strike) */
298
646
  Strikethrough = 'Strikethrough',
647
+ /** Underline (u, ins) */
299
648
  Underline = 'Underline',
649
+ /** Subscript (sub) */
300
650
  Subscript = 'Subscript',
651
+ /** Superscript (sup) */
301
652
  Superscript = 'Superscript',
653
+ /** Mark/highlight (mark) */
302
654
  Mark = 'Mark',
655
+ /** Small text (small) */
303
656
  Small = 'Small',
657
+ /** Line break (br) */
304
658
  Br = 'Br',
659
+ /** Span element */
305
660
  Span = 'Span',
661
+ /** Article element */
306
662
  Article = 'Article',
663
+ /** Section element */
307
664
  Section = 'Section',
665
+ /** Navigation element */
308
666
  Nav = 'Nav',
667
+ /** Aside element */
309
668
  Aside = 'Aside',
669
+ /** Header element */
310
670
  Header = 'Header',
671
+ /** Footer element */
311
672
  Footer = 'Footer',
673
+ /** Main element */
312
674
  Main = 'Main',
675
+ /** Figure element */
313
676
  Figure = 'Figure',
677
+ /** Figure caption */
314
678
  Figcaption = 'Figcaption',
679
+ /** Time element */
315
680
  Time = 'Time',
681
+ /** Details element */
316
682
  Details = 'Details',
683
+ /** Summary element */
317
684
  Summary = 'Summary',
685
+ /** Form element */
318
686
  Form = 'Form',
687
+ /** Input element */
319
688
  Input = 'Input',
689
+ /** Select element */
320
690
  Select = 'Select',
691
+ /** Option element */
321
692
  Option = 'Option',
693
+ /** Button element */
322
694
  Button = 'Button',
695
+ /** Textarea element */
323
696
  Textarea = 'Textarea',
697
+ /** Label element */
324
698
  Label = 'Label',
699
+ /** Fieldset element */
325
700
  Fieldset = 'Fieldset',
701
+ /** Legend element */
326
702
  Legend = 'Legend',
703
+ /** Audio element */
327
704
  Audio = 'Audio',
705
+ /** Video element */
328
706
  Video = 'Video',
707
+ /** Picture element */
329
708
  Picture = 'Picture',
709
+ /** Source element */
330
710
  Source = 'Source',
711
+ /** Iframe element */
331
712
  Iframe = 'Iframe',
713
+ /** SVG element */
332
714
  Svg = 'Svg',
715
+ /** Canvas element */
333
716
  Canvas = 'Canvas',
717
+ /** Ruby annotation */
334
718
  Ruby = 'Ruby',
719
+ /** Ruby text */
335
720
  Rt = 'Rt',
721
+ /** Ruby parenthesis */
336
722
  Rp = 'Rp',
723
+ /** Abbreviation */
337
724
  Abbr = 'Abbr',
725
+ /** Keyboard input */
338
726
  Kbd = 'Kbd',
727
+ /** Sample output */
339
728
  Samp = 'Samp',
729
+ /** Variable */
340
730
  Var = 'Var',
731
+ /** Citation */
341
732
  Cite = 'Cite',
733
+ /** Quote */
342
734
  Q = 'Q',
735
+ /** Deleted text */
343
736
  Del = 'Del',
737
+ /** Inserted text */
344
738
  Ins = 'Ins',
739
+ /** Data element */
345
740
  Data = 'Data',
741
+ /** Meter element */
346
742
  Meter = 'Meter',
743
+ /** Progress element */
347
744
  Progress = 'Progress',
745
+ /** Output element */
348
746
  Output = 'Output',
747
+ /** Template element */
349
748
  Template = 'Template',
749
+ /** Slot element */
350
750
  Slot = 'Slot',
751
+ /** HTML root element */
351
752
  Html = 'Html',
753
+ /** Head element */
352
754
  Head = 'Head',
755
+ /** Body element */
353
756
  Body = 'Body',
757
+ /** Title element */
354
758
  Title = 'Title',
759
+ /** Meta element */
355
760
  Meta = 'Meta',
761
+ /** Link element (not anchor) */
356
762
  LinkTag = 'LinkTag',
763
+ /** Style element */
357
764
  Style = 'Style',
765
+ /** Script element */
358
766
  Script = 'Script',
767
+ /** Base element */
359
768
  Base = 'Base',
769
+ /** Custom element (web components) or unknown tag */
360
770
  Custom = 'Custom'
361
771
  }
362
772
 
363
- export declare const enum JsOutputFormat {
773
+ /**
774
+ * Output format for conversion.
775
+ *
776
+ * Specifies the target markup language format for the conversion output.
777
+ */
778
+ export declare const enum OutputFormat {
779
+ /** Standard Markdown (CommonMark compatible). Default. */
364
780
  Markdown = 'Markdown',
781
+ /** Djot lightweight markup language. */
365
782
  Djot = 'Djot',
783
+ /** Plain text output (no markup, visible text only). */
366
784
  Plain = 'Plain'
367
785
  }
368
786
 
369
- export interface JsPreprocessingOptions {
787
+ /** HTML preprocessing options for document cleanup before conversion. */
788
+ export interface PreprocessingOptions {
789
+ /** Enable HTML preprocessing globally */
370
790
  enabled?: boolean
791
+ /** Preprocessing preset level (Minimal, Standard, Aggressive) */
371
792
  preset?: JsPreprocessingPreset
793
+ /** Remove navigation elements (nav, breadcrumbs, menus, sidebars) */
372
794
  removeNavigation?: boolean
795
+ /** Remove form elements (forms, inputs, buttons, etc.) */
373
796
  removeForms?: boolean
374
797
  }
375
798
 
376
- export interface JsPreprocessingOptionsUpdate {
377
- enabled?: boolean
378
- preset?: JsPreprocessingPreset
379
- removeNavigation?: boolean
380
- removeForms?: boolean
381
- }
799
+ export declare function preprocessingOptionsDefault(): PreprocessingOptions
382
800
 
383
- export declare const enum JsPreprocessingPreset {
801
+ /**
802
+ * HTML preprocessing aggressiveness level.
803
+ *
804
+ * Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
805
+ */
806
+ export declare const enum PreprocessingPreset {
807
+ /** Minimal cleanup. Remove only essential noise (scripts, styles). */
384
808
  Minimal = 'Minimal',
809
+ /** Standard cleanup. Default. Removes navigation, forms, and other auxiliary content. */
385
810
  Standard = 'Standard',
811
+ /** Aggressive cleanup. Remove extensive non-content elements and structure. */
386
812
  Aggressive = 'Aggressive'
387
813
  }
388
814
 
389
- export interface JsProcessingWarning {
815
+ /**
816
+ * A non-fatal diagnostic produced during HTML conversion.
817
+ *
818
+ * Warnings indicate that conversion completed but some content may have been handled
819
+ * differently than expected — for example, an image that could not be extracted, a truncated
820
+ * input, or malformed HTML that was repaired with best-effort parsing.
821
+ *
822
+ * Conversion always succeeds (returns `ConversionResult`) even when warnings are
823
+ * present. Callers should inspect `warnings` and decide how to
824
+ * handle them based on their tolerance for partial results:
825
+ *
826
+ * - **Logging pipelines**: emit each warning at `WARN` level and continue.
827
+ * - **Strict pipelines**: treat any warning as a hard error by checking
828
+ * that `result.warnings` is empty before using the output.
829
+ *
830
+ * See `WarningKind` for the full taxonomy of warning categories.
831
+ */
832
+ export interface ProcessingWarning {
833
+ /** Human-readable warning message. */
390
834
  message: string
835
+ /** The category of warning. */
391
836
  kind: JsWarningKind
392
837
  }
393
838
 
394
- export interface JsStructuredData {
839
+ /**
840
+ * Structured data block (JSON-LD, Microdata, or RDFa).
841
+ *
842
+ * Represents machine-readable structured data found in the document.
843
+ * JSON-LD blocks are collected as raw JSON strings for flexibility.
844
+ *
845
+ * # Examples
846
+ */
847
+ export interface StructuredData {
848
+ /** Type of structured data (JSON-LD, Microdata, RDFa) */
395
849
  dataType: JsStructuredDataType
850
+ /** Raw JSON string (for JSON-LD) or serialized representation */
396
851
  rawJson: string
852
+ /** Schema type if detectable (e.g., "Article", "Event", "Product") */
397
853
  schemaType?: string
398
854
  }
399
855
 
400
- export declare const enum JsStructuredDataType {
856
+ /**
857
+ * Structured data format type.
858
+ *
859
+ * Identifies the schema/format used for structured data markup.
860
+ */
861
+ export declare const enum StructuredDataType {
862
+ /** JSON-LD (JSON for Linking Data) script blocks */
401
863
  JsonLd = 'json_ld',
864
+ /** HTML5 Microdata attributes (itemscope, itemtype, itemprop) */
402
865
  Microdata = 'microdata',
866
+ /** RDF in Attributes (RDFa) markup */
403
867
  RDFa = 'rdfa'
404
868
  }
405
869
 
406
- export interface JsTableData {
407
- grid: JsTableGrid
870
+ /** A top-level extracted table with both structured data and markdown representation. */
871
+ export interface TableData {
872
+ /** The structured table grid. */
873
+ grid: TableGrid
874
+ /** The markdown rendering of this table. */
408
875
  markdown: string
409
876
  }
410
877
 
411
- export interface JsTableGrid {
878
+ /** A structured table grid with cell-level data including spans. */
879
+ export interface TableGrid {
880
+ /** Number of rows. */
412
881
  rows?: number
882
+ /** Number of columns. */
413
883
  cols?: number
884
+ /**
885
+ * All cells in the table as a flat, sparse list.
886
+ *
887
+ * The list is ordered by `(row, col)` but is **not** a dense `rows × cols` matrix: cells
888
+ * that are covered by a spanning cell (via `row_span > 1` or `col_span > 1`) do not appear
889
+ * in the list. Only the top-left "origin" cell of a span is present, with its `row_span`
890
+ * and `col_span` fields set accordingly.
891
+ *
892
+ * To reconstruct the full visual grid, iterate over all cells and mark the rectangular
893
+ * region `[row .. row+row_span, col .. col+col_span]` as occupied by that cell. Any
894
+ * `(row, col)` position that is not the origin of any cell is covered by a span from an
895
+ * earlier cell.
896
+ *
897
+ * The length of this array is `≤ rows * cols`. An empty table (`rows == 0 || cols == 0`)
898
+ * produces an empty array.
899
+ */
414
900
  cells?: Array<JsGridCell>
415
901
  }
416
902
 
417
- export interface JsTextAnnotation {
903
+ /**
904
+ * A styling or semantic annotation that applies to a byte range within a node's text.
905
+ *
906
+ * Unlike `DocumentNode`, which captures block-level structure (headings, paragraphs, etc.),
907
+ * a `TextAnnotation` describes inline-level markup — bold, italic, links, code spans, and
908
+ * similar — that spans a contiguous run of bytes inside `DocumentNode.content`'s text field.
909
+ *
910
+ * Byte offsets (`start`..`end`) are into the UTF-8 encoded text of the parent node. The range
911
+ * follows Rust slice conventions: `start` is inclusive and `end` is exclusive, so the annotated
912
+ * text is the half-open byte range `[start, end)` of that UTF-8 encoded text.
913
+ *
914
+ * Multiple annotations on the same node can overlap (e.g. bold-italic text), and they are
915
+ * stored in the order they are encountered during DOM traversal.
916
+ *
917
+ * See `AnnotationKind` for the full list of supported annotation types.
918
+ */
919
+ export interface TextAnnotation {
920
+ /** Start byte offset (inclusive) into the parent node's text. */
418
921
  start: number
922
+ /** End byte offset (exclusive) into the parent node's text. */
419
923
  end: number
924
+ /** The type of annotation. */
420
925
  kind: JsAnnotationKind
421
926
  }
422
927
 
423
- export declare const enum JsTextDirection {
928
+ /**
929
+ * Text directionality of document content.
930
+ *
931
+ * Corresponds to the HTML `dir` attribute and `bdi` element directionality.
932
+ */
933
+ export declare const enum TextDirection {
934
+ /** Left-to-right text flow (default for Latin scripts) */
424
935
  LeftToRight = 'ltr',
936
+ /** Right-to-left text flow (Hebrew, Arabic, Urdu, etc.) */
425
937
  RightToLeft = 'rtl',
938
+ /** Automatic directionality detection */
426
939
  Auto = 'auto'
427
940
  }
428
941
 
429
- export declare const enum JsVisitResult {
942
+ /**
943
+ * Result of a visitor callback.
944
+ *
945
+ * Allows visitors to control the conversion flow by either proceeding
946
+ * with default behavior, providing custom output, skipping elements,
947
+ * preserving HTML, or signaling errors.
948
+ */
949
+ export declare const enum VisitResult {
950
+ /** Continue with default conversion behavior */
430
951
  Continue = 'Continue',
952
+ /**
953
+ * Replace default output with custom markdown
954
+ *
955
+ * The visitor takes full responsibility for the markdown output
956
+ * of this node and its children.
957
+ */
431
958
  Custom = 'Custom',
959
+ /**
960
+ * Skip this element entirely (don't output anything)
961
+ *
962
+ * The element and all its children are ignored in the output.
963
+ */
432
964
  Skip = 'Skip',
965
+ /**
966
+ * Preserve original HTML (don't convert to markdown)
967
+ *
968
+ * The element's raw HTML is included verbatim in the output.
969
+ */
433
970
  PreserveHtml = 'PreserveHtml',
971
+ /**
972
+ * Stop conversion with an error
973
+ *
974
+ * The conversion process halts and returns this error message.
975
+ */
434
976
  Error = 'Error'
435
977
  }
436
978
 
437
- export declare const enum JsWarningKind {
979
+ /** Categories of processing warnings. */
980
+ export declare const enum WarningKind {
981
+ /** An image could not be extracted (e.g. invalid data URI, unsupported format). */
438
982
  ImageExtractionFailed = 'image_extraction_failed',
983
+ /** The input encoding was not recognized; fell back to UTF-8. */
439
984
  EncodingFallback = 'encoding_fallback',
985
+ /** The input was truncated due to size limits. */
440
986
  TruncatedInput = 'truncated_input',
987
+ /** The HTML was malformed but processing continued with best effort. */
441
988
  MalformedHtml = 'malformed_html',
989
+ /** Sanitization was applied to remove potentially unsafe content. */
442
990
  SanitizationApplied = 'sanitization_applied',
991
+ /** DOM traversal was truncated because max_depth was exceeded. */
443
992
  DepthLimitExceeded = 'depth_limit_exceeded'
444
993
  }
445
994
 
446
- export declare const enum JsWhitespaceMode {
995
+ /**
996
+ * Whitespace handling strategy during conversion.
997
+ *
998
+ * Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
999
+ */
1000
+ export declare const enum WhitespaceMode {
1001
+ /** Collapse multiple whitespace characters to single spaces. Default. Matches browser behavior. */
447
1002
  Normalized = 'Normalized',
1003
+ /** Preserve all whitespace exactly as it appears in the HTML. */
448
1004
  Strict = 'Strict'
449
1005
  }