@kreuzberg/html-to-markdown-node 3.4.0 → 3.5.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/index.d.ts +667 -111
- package/index.js +109 -755
- package/package.json +10 -11
package/index.d.ts
CHANGED
|
@@ -1,250 +1,553 @@
|
|
|
1
1
|
/* auto-generated by NAPI-RS */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Shareable, thread-safe handle to a user-provided HTML visitor implementation.
|
|
5
|
+
*
|
|
6
|
+
* Pass an instance wrapped in this handle to `ConversionOptions` to
|
|
7
|
+
* customise how the HTML document is traversed and converted to Markdown.
|
|
8
|
+
* The handle may be cloned and shared across threads without additional
|
|
9
|
+
* synchronisation on the caller's side.
|
|
10
|
+
*/
|
|
11
|
+
export declare class VisitorHandle {
|
|
12
|
+
|
|
13
|
+
}
|
|
14
|
+
export type JsVisitorHandle = VisitorHandle
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* The type of an inline text annotation.
|
|
18
|
+
*
|
|
19
|
+
* Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
20
|
+
*/
|
|
21
|
+
export interface AnnotationKind {
|
|
19
22
|
annotation_type: string
|
|
20
23
|
url?: string
|
|
21
24
|
title?: string
|
|
22
25
|
}
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
/**
|
|
28
|
+
* Code block fence style in Markdown output.
|
|
29
|
+
*
|
|
30
|
+
* Determines how code blocks (`<pre><code>`) are rendered in Markdown.
|
|
31
|
+
*/
|
|
32
|
+
export declare const enum CodeBlockStyle {
|
|
33
|
+
/** Indented code blocks (4 spaces). `CommonMark` standard. */
|
|
25
34
|
Indented = 'Indented',
|
|
35
|
+
/** Fenced code blocks with backticks (```). Default (GFM). Supports language hints. */
|
|
26
36
|
Backticks = 'Backticks',
|
|
37
|
+
/** Fenced code blocks with tildes (~~~). Supports language hints. */
|
|
27
38
|
Tildes = 'Tildes'
|
|
28
39
|
}
|
|
29
40
|
|
|
30
|
-
|
|
41
|
+
/**
|
|
42
|
+
* Main conversion options for HTML to Markdown conversion.
|
|
43
|
+
*
|
|
44
|
+
* Use `ConversionOptions.builder()` to construct, or `Default.default()` for defaults.
|
|
45
|
+
*
|
|
46
|
+
* # Example
|
|
47
|
+
*/
|
|
48
|
+
export interface ConversionOptions {
|
|
49
|
+
/** Heading style to use in Markdown output (ATX `#` or Setext underline). */
|
|
31
50
|
headingStyle?: JsHeadingStyle
|
|
51
|
+
/** How to indent nested list items (spaces or tab). */
|
|
32
52
|
listIndentType?: JsListIndentType
|
|
53
|
+
/** Number of spaces (or tabs) to use for each level of list indentation. */
|
|
33
54
|
listIndentWidth?: number
|
|
55
|
+
/** Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`). */
|
|
34
56
|
bullets?: string
|
|
57
|
+
/** Character used for bold/italic emphasis markers (`*` or `_`). */
|
|
35
58
|
strongEmSymbol?: string
|
|
59
|
+
/** Escape `*` characters in plain text to avoid unintended bold/italic. */
|
|
36
60
|
escapeAsterisks?: boolean
|
|
61
|
+
/** Escape `_` characters in plain text to avoid unintended bold/italic. */
|
|
37
62
|
escapeUnderscores?: boolean
|
|
63
|
+
/** Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text. */
|
|
38
64
|
escapeMisc?: boolean
|
|
65
|
+
/** Escape ASCII characters that have special meaning in certain Markdown dialects. */
|
|
39
66
|
escapeAscii?: boolean
|
|
67
|
+
/** Default language annotation for fenced code blocks that have no language hint. */
|
|
40
68
|
codeLanguage?: string
|
|
69
|
+
/** Automatically convert bare URLs into Markdown autolinks. */
|
|
41
70
|
autolinks?: boolean
|
|
71
|
+
/** Emit a default title when no `<title>` tag is present. */
|
|
42
72
|
defaultTitle?: boolean
|
|
73
|
+
/** Render `<br>` elements inside table cells as literal line breaks. */
|
|
43
74
|
brInTables?: boolean
|
|
75
|
+
/**
|
|
76
|
+
* Emit tables without column padding (compact GFM format).
|
|
77
|
+
*
|
|
78
|
+
* When `true`, column widths are not computed and cells are emitted with
|
|
79
|
+
* no trailing spaces. Separator rows use exactly `---` per column.
|
|
80
|
+
* Produces token-efficient output suitable for RAG / LLM contexts.
|
|
81
|
+
*
|
|
82
|
+
* Default `false` (aligned padding preserved).
|
|
83
|
+
*/
|
|
84
|
+
compactTables?: boolean
|
|
85
|
+
/** Style used for `<mark>` / highlighted text (e.g. `==text==`). */
|
|
44
86
|
highlightStyle?: JsHighlightStyle
|
|
87
|
+
/**
|
|
88
|
+
* Populate `result.metadata` with `<head>` / `<meta>` extraction
|
|
89
|
+
* (title, description, Open Graph, Twitter Card, JSON-LD, …).
|
|
90
|
+
*
|
|
91
|
+
* Default `true`. Disabling skips the metadata pass only — table
|
|
92
|
+
* extraction into `result.tables` runs unconditionally.
|
|
93
|
+
*/
|
|
45
94
|
extractMetadata?: boolean
|
|
95
|
+
/**
|
|
96
|
+
* Controls how whitespace sequences are normalised in the converted output.
|
|
97
|
+
*
|
|
98
|
+
* - [`WhitespaceMode::Normalized`] (default) — collapses consecutive whitespace characters
|
|
99
|
+
* (spaces, tabs, newlines) to a single space, matching browser rendering behaviour.
|
|
100
|
+
* - [`WhitespaceMode::Strict`] — preserves all whitespace exactly as it appears in the
|
|
101
|
+
* source HTML, including runs of spaces and embedded newlines.
|
|
102
|
+
*
|
|
103
|
+
* Choose `Strict` only when the source HTML uses deliberate whitespace (e.g. pre-formatted
|
|
104
|
+
* content outside `<pre>` tags). For most documents `Normalized` produces cleaner output.
|
|
105
|
+
*/
|
|
46
106
|
whitespaceMode?: JsWhitespaceMode
|
|
107
|
+
/** Strip all newlines from the output, producing a single-line result. */
|
|
47
108
|
stripNewlines?: boolean
|
|
109
|
+
/** Wrap long lines at [`wrap_width`](Self::wrap_width) characters. */
|
|
48
110
|
wrap?: boolean
|
|
111
|
+
/**
|
|
112
|
+
* Maximum output line width in characters when [`wrap`](Self::wrap) is `true` (default `80`).
|
|
113
|
+
*
|
|
114
|
+
* Lines are broken at word boundaries so that no line exceeds this length. A value of `0`
|
|
115
|
+
* is treated as "no limit" — equivalent to leaving [`wrap`](Self::wrap) disabled. Has no
|
|
116
|
+
* effect when `wrap` is `false`.
|
|
117
|
+
*/
|
|
49
118
|
wrapWidth?: number
|
|
119
|
+
/** Treat the entire document as inline content (no block-level wrappers). */
|
|
50
120
|
convertAsInline?: boolean
|
|
121
|
+
/** Markdown notation for subscript text (e.g. `"~"`). */
|
|
51
122
|
subSymbol?: string
|
|
123
|
+
/** Markdown notation for superscript text (e.g. `"^"`). */
|
|
52
124
|
supSymbol?: string
|
|
125
|
+
/** How to encode hard line breaks (`<br>`) in Markdown. */
|
|
53
126
|
newlineStyle?: JsNewlineStyle
|
|
127
|
+
/** Style used for code blocks (indented, backtick-fenced, or tilde-fenced). */
|
|
54
128
|
codeBlockStyle?: JsCodeBlockStyle
|
|
129
|
+
/** HTML tag names whose `<img>` children are kept inline instead of block. */
|
|
55
130
|
keepInlineImagesIn?: Array<string>
|
|
131
|
+
/**
|
|
132
|
+
* Options for the HTML pre-processing pass applied before conversion begins.
|
|
133
|
+
*
|
|
134
|
+
* Pre-processing runs before the HTML is handed to the converter and can perform operations
|
|
135
|
+
* such as unwrapping redundant wrapper elements, removing tracking pixels, and normalising
|
|
136
|
+
* vendor-specific markup. See [`PreprocessingOptions`] for the full set of knobs.
|
|
137
|
+
*
|
|
138
|
+
* Defaults to [`PreprocessingOptions::default()`], which enables the standard cleaning
|
|
139
|
+
* passes. Set individual fields on [`PreprocessingOptions`] (or construct via
|
|
140
|
+
* [`ConversionOptions::builder`]) to opt in or out of specific passes.
|
|
141
|
+
*/
|
|
56
142
|
preprocessing?: JsPreprocessingOptions
|
|
143
|
+
/** Expected character encoding of the input HTML (default `"utf-8"`). */
|
|
57
144
|
encoding?: string
|
|
145
|
+
/** Emit debug information during conversion. */
|
|
58
146
|
debug?: boolean
|
|
147
|
+
/** HTML tag names whose tag wrapper is stripped from the output while their children are kept (use `excludeSelectors` to drop an element together with its content). */
|
|
59
148
|
stripTags?: Array<string>
|
|
149
|
+
/** HTML tag names that are preserved verbatim in the output. */
|
|
60
150
|
preserveTags?: Array<string>
|
|
151
|
+
/** Skip conversion of `<img>` elements (omit images from output). */
|
|
61
152
|
skipImages?: boolean
|
|
153
|
+
/** Link rendering style (inline or reference). */
|
|
62
154
|
linkStyle?: JsLinkStyle
|
|
155
|
+
/** Target output format (Markdown, plain text, etc.). */
|
|
63
156
|
outputFormat?: JsOutputFormat
|
|
157
|
+
/** Include structured document tree in result. */
|
|
64
158
|
includeDocumentStructure?: boolean
|
|
159
|
+
/** Extract inline images from data URIs and SVGs. */
|
|
65
160
|
extractImages?: boolean
|
|
161
|
+
/** Maximum decoded image size in bytes (default 5MB). */
|
|
66
162
|
maxImageSize?: number
|
|
163
|
+
/** Capture SVG elements as images. */
|
|
67
164
|
captureSvg?: boolean
|
|
165
|
+
/** Infer image dimensions from data. */
|
|
68
166
|
inferDimensions?: boolean
|
|
167
|
+
/**
|
|
168
|
+
* Maximum DOM traversal depth. `None` means unlimited.
|
|
169
|
+
* When set, subtrees beyond this depth are silently truncated.
|
|
170
|
+
*/
|
|
69
171
|
maxDepth?: number
|
|
172
|
+
/**
|
|
173
|
+
* CSS selectors for elements to exclude entirely (element + all content).
|
|
174
|
+
*
|
|
175
|
+
* Unlike `strip_tags` (which removes the tag wrapper but keeps children),
|
|
176
|
+
* excluded elements and all their descendants are dropped from the output.
|
|
177
|
+
* Supports any CSS selector that `tl` supports: tag names, `.class`,
|
|
178
|
+
* `#id`, `[attribute]`, etc.
|
|
179
|
+
*
|
|
180
|
+
* Invalid selectors are silently skipped at conversion time.
|
|
181
|
+
*
|
|
182
|
+
* Example: `[".cookie-banner", "#ad-container", "[role='complementary']"]`
|
|
183
|
+
*/
|
|
70
184
|
excludeSelectors?: Array<string>
|
|
185
|
+
/**
|
|
186
|
+
* Optional visitor for custom traversal logic.
|
|
187
|
+
*
|
|
188
|
+
* When set, the visitor's callbacks are invoked for matching HTML elements
|
|
189
|
+
* during conversion, allowing custom output, skipping, or HTML preservation.
|
|
190
|
+
* See `HtmlVisitor`.
|
|
191
|
+
*/
|
|
71
192
|
visitor?: object
|
|
72
193
|
}
|
|
73
194
|
|
|
74
|
-
export
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
keepInlineImagesIn?: Array<string>
|
|
100
|
-
preprocessing?: JsPreprocessingOptionsUpdate
|
|
101
|
-
encoding?: string
|
|
102
|
-
debug?: boolean
|
|
103
|
-
stripTags?: Array<string>
|
|
104
|
-
preserveTags?: Array<string>
|
|
105
|
-
skipImages?: boolean
|
|
106
|
-
linkStyle?: JsLinkStyle
|
|
107
|
-
outputFormat?: JsOutputFormat
|
|
108
|
-
includeDocumentStructure?: boolean
|
|
109
|
-
extractImages?: boolean
|
|
110
|
-
maxImageSize?: number
|
|
111
|
-
captureSvg?: boolean
|
|
112
|
-
inferDimensions?: boolean
|
|
113
|
-
maxDepth?: number
|
|
114
|
-
excludeSelectors?: Array<string>
|
|
115
|
-
visitor?: object
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
export interface JsConversionResult {
|
|
195
|
+
export declare function conversionOptionsDefault(): ConversionOptions
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* The primary result of HTML conversion and extraction.
|
|
199
|
+
*
|
|
200
|
+
* Contains the converted text output, optional structured document tree,
|
|
201
|
+
* metadata, extracted tables, images, and processing warnings.
|
|
202
|
+
*
|
|
203
|
+
* # Example
|
|
204
|
+
*
|
|
205
|
+
* ```text
|
|
206
|
+
* use html_to_markdown_rs::{convert, ConversionOptions};
|
|
207
|
+
*
|
|
208
|
+
* let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
209
|
+
* assert!(result.content.is_some());
|
|
210
|
+
* assert!(result.warnings.is_empty());
|
|
211
|
+
* ```
|
|
212
|
+
*/
|
|
213
|
+
export interface ConversionResult {
|
|
214
|
+
/**
|
|
215
|
+
* Converted text output (markdown, djot, or plain text).
|
|
216
|
+
*
|
|
217
|
+
* `None` when `output_format` is set to `OutputFormat::None`,
|
|
218
|
+
* indicating extraction-only mode.
|
|
219
|
+
*/
|
|
119
220
|
content?: string
|
|
120
|
-
|
|
121
|
-
|
|
221
|
+
/**
|
|
222
|
+
* Structured document tree with semantic elements.
|
|
223
|
+
*
|
|
224
|
+
* Populated when `ConversionOptions::include_document_structure` is `true`. `None`
|
|
225
|
+
* otherwise (the default), which avoids the overhead of building the tree.
|
|
226
|
+
*
|
|
227
|
+
* When present, the tree mirrors the converted document: headings open
|
|
228
|
+
* `Group` sections, paragraphs and list items carry
|
|
229
|
+
* inline `TextAnnotation`s, and tables reference the same
|
|
230
|
+
* `TableGrid` data exposed in [`Self::tables`].
|
|
231
|
+
*
|
|
232
|
+
* Note: this field is independent of the `metadata` feature flag. Document structure
|
|
233
|
+
* collection is always available at runtime; it is gated only by the runtime option, not
|
|
234
|
+
* by a compile-time feature.
|
|
235
|
+
*/
|
|
236
|
+
document?: DocumentStructure
|
|
237
|
+
/** Extracted HTML metadata (title, OG, links, images, structured data). */
|
|
238
|
+
metadata?: HtmlMetadata
|
|
239
|
+
/** Extracted tables with structured cell data and markdown representation. */
|
|
122
240
|
tables?: Array<JsTableData>
|
|
241
|
+
/**
|
|
242
|
+
* Extracted inline images (data URIs and SVGs).
|
|
243
|
+
*
|
|
244
|
+
* Populated when `extract_images` is `true` in options.
|
|
245
|
+
*/
|
|
123
246
|
images?: Array<string>
|
|
247
|
+
/** Non-fatal processing warnings. */
|
|
124
248
|
warnings?: Array<JsProcessingWarning>
|
|
125
249
|
}
|
|
126
250
|
|
|
127
|
-
export
|
|
251
|
+
export declare function convert(html: string, options?: ConversionOptions | undefined | null, visitor?: object | undefined | null): ConversionResult
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Document-level metadata extracted from `<head>` and top-level elements.
|
|
255
|
+
*
|
|
256
|
+
* Contains all metadata typically used by search engines, social media platforms,
|
|
257
|
+
* and browsers for document indexing and presentation.
|
|
258
|
+
*
|
|
259
|
+
* # Examples
|
|
260
|
+
*/
|
|
261
|
+
export interface DocumentMetadata {
|
|
262
|
+
/** Document title from `<title>` tag */
|
|
128
263
|
title?: string
|
|
264
|
+
/** Document description from `<meta name="description">` tag */
|
|
129
265
|
description?: string
|
|
266
|
+
/** Document keywords from `<meta name="keywords">` tag, split on commas */
|
|
130
267
|
keywords?: Array<string>
|
|
268
|
+
/** Document author from `<meta name="author">` tag */
|
|
131
269
|
author?: string
|
|
270
|
+
/** Canonical URL from `<link rel="canonical">` tag */
|
|
132
271
|
canonicalUrl?: string
|
|
272
|
+
/** Base URL from `<base href="">` tag for resolving relative URLs */
|
|
133
273
|
baseHref?: string
|
|
274
|
+
/** Document language from `lang` attribute */
|
|
134
275
|
language?: string
|
|
276
|
+
/** Document text direction from `dir` attribute */
|
|
135
277
|
textDirection?: JsTextDirection
|
|
278
|
+
/**
|
|
279
|
+
* Open Graph metadata (og:* properties) for social media
|
|
280
|
+
* Keys like "title", "description", "image", "url", etc.
|
|
281
|
+
*/
|
|
136
282
|
openGraph?: Record<string, string>
|
|
283
|
+
/**
|
|
284
|
+
* Twitter Card metadata (twitter:* properties)
|
|
285
|
+
* Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|
286
|
+
*/
|
|
137
287
|
twitterCard?: Record<string, string>
|
|
288
|
+
/**
|
|
289
|
+
* Additional meta tags not covered by specific fields
|
|
290
|
+
* Keys are meta name/property attributes, values are content
|
|
291
|
+
*/
|
|
138
292
|
metaTags?: Record<string, string>
|
|
139
293
|
}
|
|
140
294
|
|
|
141
|
-
|
|
295
|
+
/** A single node in the document tree. */
|
|
296
|
+
export interface DocumentNode {
|
|
297
|
+
/** Deterministic node identifier. */
|
|
142
298
|
id: string
|
|
299
|
+
/** The semantic content of this node. */
|
|
143
300
|
content: JsNodeContent
|
|
301
|
+
/** Index of the parent node (None for root nodes). */
|
|
144
302
|
parent?: number
|
|
303
|
+
/** Indices of child nodes in reading order. */
|
|
145
304
|
children: Array<number>
|
|
305
|
+
/** Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text. */
|
|
146
306
|
annotations: Array<JsTextAnnotation>
|
|
307
|
+
/**
|
|
308
|
+
* Format-specific attributes preserved from the source HTML element.
|
|
309
|
+
*
|
|
310
|
+
* Keys are lowercased attribute names as they appear in the HTML (e.g. `"class"`, `"id"`,
|
|
311
|
+
* `"data-foo"`). Values are the raw attribute strings, copied verbatim from the source —
|
|
312
|
+
* no HTML entity decoding is applied here.
|
|
313
|
+
*
|
|
314
|
+
* The map is `None` when no attributes are present (omitted entirely in serialized output).
|
|
315
|
+
* Not every HTML attribute is preserved: only attributes that carry semantic or structural
|
|
316
|
+
* significance for the node type are collected. For example, heading nodes capture the `"id"`
|
|
317
|
+
* attribute for anchor linking; other element-level attributes may be silently dropped.
|
|
318
|
+
*/
|
|
147
319
|
attributes?: Record<string, string>
|
|
148
320
|
}
|
|
149
321
|
|
|
150
|
-
|
|
322
|
+
/**
|
|
323
|
+
* A structured document tree representing the semantic content of an HTML document.
|
|
324
|
+
*
|
|
325
|
+
* Uses a flat node array with index-based parent/child references for efficient traversal.
|
|
326
|
+
*/
|
|
327
|
+
export interface DocumentStructure {
|
|
328
|
+
/** All nodes in document reading order. */
|
|
151
329
|
nodes: Array<JsDocumentNode>
|
|
330
|
+
/** The source format (always "html" for this crate). */
|
|
152
331
|
sourceFormat?: string
|
|
153
332
|
}
|
|
154
333
|
|
|
155
|
-
|
|
334
|
+
/** A single cell in a table grid. */
|
|
335
|
+
export interface GridCell {
|
|
336
|
+
/** The text content of the cell. */
|
|
156
337
|
content: string
|
|
338
|
+
/** 0-indexed row position. */
|
|
157
339
|
row: number
|
|
340
|
+
/** 0-indexed column position. */
|
|
158
341
|
col: number
|
|
342
|
+
/** Number of rows this cell spans (default 1). */
|
|
159
343
|
rowSpan: number
|
|
344
|
+
/** Number of columns this cell spans (default 1). */
|
|
160
345
|
colSpan: number
|
|
346
|
+
/** Whether this is a header cell (`<th>`). */
|
|
161
347
|
isHeader: boolean
|
|
162
348
|
}
|
|
163
349
|
|
|
164
|
-
|
|
350
|
+
/**
|
|
351
|
+
* Header element metadata with hierarchy tracking.
|
|
352
|
+
*
|
|
353
|
+
* Captures heading elements (h1-h6) with their text content, identifiers,
|
|
354
|
+
* and position in the document structure.
|
|
355
|
+
*
|
|
356
|
+
* # Examples
|
|
357
|
+
*/
|
|
358
|
+
export interface HeaderMetadata {
|
|
359
|
+
/** Header level: 1 (h1) through 6 (h6) */
|
|
165
360
|
level: number
|
|
361
|
+
/** Normalized text content of the header */
|
|
166
362
|
text: string
|
|
363
|
+
/** HTML id attribute if present */
|
|
167
364
|
id?: string
|
|
365
|
+
/** Document tree depth at the header element */
|
|
168
366
|
depth: number
|
|
367
|
+
/** Byte offset in original HTML document */
|
|
169
368
|
htmlOffset: number
|
|
170
369
|
}
|
|
171
370
|
|
|
172
|
-
|
|
371
|
+
/**
|
|
372
|
+
* Heading style options for Markdown output.
|
|
373
|
+
*
|
|
374
|
+
* Controls how headings (h1-h6) are rendered in the output Markdown.
|
|
375
|
+
*/
|
|
376
|
+
export declare const enum HeadingStyle {
|
|
377
|
+
/** Underlined style (=== for h1, --- for h2). */
|
|
173
378
|
Underlined = 'Underlined',
|
|
379
|
+
/** ATX style (# for h1, ## for h2, etc.). Default. */
|
|
174
380
|
Atx = 'Atx',
|
|
381
|
+
/** ATX closed style (# title #, with closing hashes). */
|
|
175
382
|
AtxClosed = 'AtxClosed'
|
|
176
383
|
}
|
|
177
384
|
|
|
178
|
-
|
|
385
|
+
/**
|
|
386
|
+
* Highlight rendering style for `<mark>` elements.
|
|
387
|
+
*
|
|
388
|
+
* Controls how highlighted text is rendered in Markdown output.
|
|
389
|
+
*/
|
|
390
|
+
export declare const enum HighlightStyle {
|
|
391
|
+
/** Double equals syntax (==text==). Default. Pandoc-compatible. */
|
|
179
392
|
DoubleEqual = 'DoubleEqual',
|
|
393
|
+
/** Preserve as HTML (`<mark>text</mark>`). Original HTML tag. */
|
|
180
394
|
Html = 'Html',
|
|
395
|
+
/** Render as bold (**text**). Uses strong emphasis. */
|
|
181
396
|
Bold = 'Bold',
|
|
397
|
+
/** Strip formatting, render as plain text. No markup. */
|
|
182
398
|
None = 'None'
|
|
183
399
|
}
|
|
184
400
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
export interface
|
|
401
|
+
/**
|
|
402
|
+
* Comprehensive metadata extraction result from HTML document.
|
|
403
|
+
*
|
|
404
|
+
* Contains all extracted metadata types in a single structure,
|
|
405
|
+
* suitable for serialization and transmission across language boundaries.
|
|
406
|
+
*
|
|
407
|
+
* # Examples
|
|
408
|
+
*/
|
|
409
|
+
export interface HtmlMetadata {
|
|
410
|
+
/** Document-level metadata (title, description, canonical, etc.) */
|
|
411
|
+
document?: DocumentMetadata
|
|
412
|
+
/** Extracted header elements with hierarchy */
|
|
413
|
+
headers?: Array<HeaderMetadata>
|
|
414
|
+
/** Extracted hyperlinks with type classification */
|
|
415
|
+
links?: Array<LinkMetadata>
|
|
416
|
+
/** Extracted images with source and dimensions */
|
|
417
|
+
images?: Array<ImageMetadata>
|
|
418
|
+
/** Extracted structured data blocks */
|
|
419
|
+
structuredData?: Array<StructuredData>
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* Image metadata with source and dimensions.
|
|
424
|
+
*
|
|
425
|
+
* Captures `<img>` elements and inline `<svg>` elements with metadata
|
|
426
|
+
* for image analysis and optimization.
|
|
427
|
+
*
|
|
428
|
+
* # Examples
|
|
429
|
+
*/
|
|
430
|
+
export interface ImageMetadata {
|
|
431
|
+
/** Image source (URL, data URI, or SVG content identifier) */
|
|
194
432
|
src: string
|
|
433
|
+
/** Alternative text from alt attribute (for accessibility) */
|
|
195
434
|
alt?: string
|
|
435
|
+
/** Title attribute (often shown as tooltip) */
|
|
196
436
|
title?: string
|
|
437
|
+
/** Image dimensions as (width, height) if available */
|
|
197
438
|
dimensions?: Array<number>
|
|
439
|
+
/** Image type classification */
|
|
198
440
|
imageType: JsImageType
|
|
441
|
+
/** Additional HTML attributes */
|
|
199
442
|
attributes: Record<string, string>
|
|
200
443
|
}
|
|
201
444
|
|
|
202
|
-
|
|
445
|
+
/**
|
|
446
|
+
* Image source classification for proper handling and processing.
|
|
447
|
+
*
|
|
448
|
+
* Determines whether an image is embedded (data URI), inline SVG, external, or relative.
|
|
449
|
+
*/
|
|
450
|
+
export declare const enum ImageType {
|
|
451
|
+
/** Data URI embedded image (base64 or other encoding) */
|
|
203
452
|
DataUri = 'data_uri',
|
|
453
|
+
/** Inline SVG element */
|
|
204
454
|
InlineSvg = 'inline_svg',
|
|
455
|
+
/** External image URL (http/https) */
|
|
205
456
|
External = 'external',
|
|
457
|
+
/** Relative image path */
|
|
206
458
|
Relative = 'relative'
|
|
207
459
|
}
|
|
208
460
|
|
|
209
|
-
|
|
461
|
+
/**
|
|
462
|
+
* Hyperlink metadata with categorization and attributes.
|
|
463
|
+
*
|
|
464
|
+
* Represents `<a>` elements with parsed href values, text content, and link type classification.
|
|
465
|
+
*
|
|
466
|
+
* # Examples
|
|
467
|
+
*/
|
|
468
|
+
export interface LinkMetadata {
|
|
469
|
+
/** The href URL value */
|
|
210
470
|
href: string
|
|
471
|
+
/** Link text content (normalized, concatenated if mixed with elements) */
|
|
211
472
|
text: string
|
|
473
|
+
/** Optional title attribute (often shown as tooltip) */
|
|
212
474
|
title?: string
|
|
475
|
+
/** Link type classification */
|
|
213
476
|
linkType: JsLinkType
|
|
477
|
+
/** Rel attribute values (e.g., "nofollow", "stylesheet", "canonical") */
|
|
214
478
|
rel: Array<string>
|
|
479
|
+
/** Additional HTML attributes */
|
|
215
480
|
attributes: Record<string, string>
|
|
216
481
|
}
|
|
217
482
|
|
|
218
|
-
|
|
483
|
+
/**
|
|
484
|
+
* Link rendering style in Markdown output.
|
|
485
|
+
*
|
|
486
|
+
* Controls whether links and images use inline `[text](url)` syntax or
|
|
487
|
+
* reference-style `[text][1]` syntax with definitions collected at the end.
|
|
488
|
+
*/
|
|
489
|
+
export declare const enum LinkStyle {
|
|
490
|
+
/** Inline links: `[text](url)`. Default. */
|
|
219
491
|
Inline = 'Inline',
|
|
492
|
+
/** Reference-style links: `[text][1]` with `[1]: url` at end of document. */
|
|
220
493
|
Reference = 'Reference'
|
|
221
494
|
}
|
|
222
495
|
|
|
223
|
-
|
|
496
|
+
/**
|
|
497
|
+
* Link classification based on href value and document context.
|
|
498
|
+
*
|
|
499
|
+
* Used to categorize links during extraction for filtering and analysis.
|
|
500
|
+
*/
|
|
501
|
+
export declare const enum LinkType {
|
|
502
|
+
/** Anchor link within same document (href starts with #) */
|
|
224
503
|
Anchor = 'anchor',
|
|
504
|
+
/** Internal link within same domain */
|
|
225
505
|
Internal = 'internal',
|
|
506
|
+
/** External link to different domain */
|
|
226
507
|
External = 'external',
|
|
508
|
+
/** Email link (mailto:) */
|
|
227
509
|
Email = 'email',
|
|
510
|
+
/** Phone link (tel:) */
|
|
228
511
|
Phone = 'phone',
|
|
512
|
+
/** Other protocol or unclassifiable */
|
|
229
513
|
Other = 'other'
|
|
230
514
|
}
|
|
231
515
|
|
|
232
|
-
|
|
516
|
+
/**
|
|
517
|
+
* List indentation character type.
|
|
518
|
+
*
|
|
519
|
+
* Controls whether list items are indented with spaces or tabs.
|
|
520
|
+
*/
|
|
521
|
+
export declare const enum ListIndentType {
|
|
522
|
+
/** Use spaces for indentation. Default. Width controlled by `list_indent_width`. */
|
|
233
523
|
Spaces = 'Spaces',
|
|
524
|
+
/** Use tabs for indentation. */
|
|
234
525
|
Tabs = 'Tabs'
|
|
235
526
|
}
|
|
236
527
|
|
|
237
|
-
|
|
528
|
+
/**
|
|
529
|
+
* Line break syntax in Markdown output.
|
|
530
|
+
*
|
|
531
|
+
* Controls how hard line breaks (from `<br>` or line breaks in source) are rendered.
|
|
532
|
+
*/
|
|
533
|
+
export declare const enum NewlineStyle {
|
|
534
|
+
/** Two trailing spaces at end of line. Default. Standard Markdown syntax. */
|
|
238
535
|
Spaces = 'Spaces',
|
|
536
|
+
/** Backslash at end of line. Alternative Markdown syntax. */
|
|
239
537
|
Backslash = 'Backslash'
|
|
240
538
|
}
|
|
241
539
|
|
|
242
|
-
|
|
540
|
+
/**
|
|
541
|
+
* The semantic content type of a document node.
|
|
542
|
+
*
|
|
543
|
+
* Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
|
|
544
|
+
*/
|
|
545
|
+
export interface NodeContent {
|
|
243
546
|
node_type: string
|
|
244
547
|
level?: number
|
|
245
548
|
text?: string
|
|
246
549
|
ordered?: boolean
|
|
247
|
-
grid?:
|
|
550
|
+
grid?: TableGrid
|
|
248
551
|
description?: string
|
|
249
552
|
src?: string
|
|
250
553
|
imageIndex?: number
|
|
@@ -259,191 +562,444 @@ export interface JsNodeContent {
|
|
|
259
562
|
headingText?: string
|
|
260
563
|
}
|
|
261
564
|
|
|
262
|
-
|
|
565
|
+
/**
|
|
566
|
+
* Context information passed to all visitor methods.
|
|
567
|
+
*
|
|
568
|
+
* Provides comprehensive metadata about the current node being visited,
|
|
569
|
+
* including its type, attributes, position in the DOM tree, and parent context.
|
|
570
|
+
*/
|
|
571
|
+
export interface NodeContext {
|
|
572
|
+
/** Coarse-grained node type classification */
|
|
263
573
|
nodeType: JsNodeType
|
|
574
|
+
/** Raw HTML tag name (e.g., "div", "h1", "custom-element") */
|
|
264
575
|
tagName: string
|
|
576
|
+
/** All HTML attributes as key-value pairs */
|
|
265
577
|
attributes: Record<string, string>
|
|
578
|
+
/** Depth in the DOM tree (0 = root) */
|
|
266
579
|
depth: number
|
|
580
|
+
/** Index among siblings (0-based) */
|
|
267
581
|
indexInParent: number
|
|
582
|
+
/** Parent element's tag name (None if root) */
|
|
268
583
|
parentTag?: string
|
|
584
|
+
/** Whether this element is treated as inline vs block */
|
|
269
585
|
isInline: boolean
|
|
270
586
|
}
|
|
271
587
|
|
|
272
|
-
|
|
588
|
+
/**
|
|
589
|
+
* Node type enumeration covering all HTML element types.
|
|
590
|
+
*
|
|
591
|
+
* This enum categorizes all HTML elements that the converter recognizes,
|
|
592
|
+
* providing a coarse-grained classification for visitor dispatch.
|
|
593
|
+
*/
|
|
594
|
+
export declare const enum NodeType {
|
|
595
|
+
/** Text node (most frequent - 100+ per document) */
|
|
273
596
|
Text = 'Text',
|
|
597
|
+
/** Generic element node */
|
|
274
598
|
Element = 'Element',
|
|
599
|
+
/** Heading elements (h1-h6) */
|
|
275
600
|
Heading = 'Heading',
|
|
601
|
+
/** Paragraph element */
|
|
276
602
|
Paragraph = 'Paragraph',
|
|
603
|
+
/** Generic div container */
|
|
277
604
|
Div = 'Div',
|
|
605
|
+
/** Blockquote element */
|
|
278
606
|
Blockquote = 'Blockquote',
|
|
607
|
+
/** Preformatted text block */
|
|
279
608
|
Pre = 'Pre',
|
|
609
|
+
/** Horizontal rule */
|
|
280
610
|
Hr = 'Hr',
|
|
611
|
+
/** Ordered or unordered list (ul, ol) */
|
|
281
612
|
List = 'List',
|
|
613
|
+
/** List item (li) */
|
|
282
614
|
ListItem = 'ListItem',
|
|
615
|
+
/** Definition list (dl) */
|
|
283
616
|
DefinitionList = 'DefinitionList',
|
|
617
|
+
/** Definition term (dt) */
|
|
284
618
|
DefinitionTerm = 'DefinitionTerm',
|
|
619
|
+
/** Definition description (dd) */
|
|
285
620
|
DefinitionDescription = 'DefinitionDescription',
|
|
621
|
+
/** Table element */
|
|
286
622
|
Table = 'Table',
|
|
623
|
+
/** Table row (tr) */
|
|
287
624
|
TableRow = 'TableRow',
|
|
625
|
+
/** Table cell (td, th) */
|
|
288
626
|
TableCell = 'TableCell',
|
|
627
|
+
/** Table header cell (th) */
|
|
289
628
|
TableHeader = 'TableHeader',
|
|
629
|
+
/** Table body (tbody) */
|
|
290
630
|
TableBody = 'TableBody',
|
|
631
|
+
/** Table head (thead) */
|
|
291
632
|
TableHead = 'TableHead',
|
|
633
|
+
/** Table foot (tfoot) */
|
|
292
634
|
TableFoot = 'TableFoot',
|
|
635
|
+
/** Anchor link (a) */
|
|
293
636
|
Link = 'Link',
|
|
637
|
+
/** Image (img) */
|
|
294
638
|
Image = 'Image',
|
|
639
|
+
/** Strong/bold (strong, b) */
|
|
295
640
|
Strong = 'Strong',
|
|
641
|
+
/** Emphasis/italic (em, i) */
|
|
296
642
|
Em = 'Em',
|
|
643
|
+
/** Inline code (code) */
|
|
297
644
|
Code = 'Code',
|
|
645
|
+
/** Strikethrough (s, del, strike) */
|
|
298
646
|
Strikethrough = 'Strikethrough',
|
|
647
|
+
/** Underline (u, ins) */
|
|
299
648
|
Underline = 'Underline',
|
|
649
|
+
/** Subscript (sub) */
|
|
300
650
|
Subscript = 'Subscript',
|
|
651
|
+
/** Superscript (sup) */
|
|
301
652
|
Superscript = 'Superscript',
|
|
653
|
+
/** Mark/highlight (mark) */
|
|
302
654
|
Mark = 'Mark',
|
|
655
|
+
/** Small text (small) */
|
|
303
656
|
Small = 'Small',
|
|
657
|
+
/** Line break (br) */
|
|
304
658
|
Br = 'Br',
|
|
659
|
+
/** Span element */
|
|
305
660
|
Span = 'Span',
|
|
661
|
+
/** Article element */
|
|
306
662
|
Article = 'Article',
|
|
663
|
+
/** Section element */
|
|
307
664
|
Section = 'Section',
|
|
665
|
+
/** Navigation element */
|
|
308
666
|
Nav = 'Nav',
|
|
667
|
+
/** Aside element */
|
|
309
668
|
Aside = 'Aside',
|
|
669
|
+
/** Header element */
|
|
310
670
|
Header = 'Header',
|
|
671
|
+
/** Footer element */
|
|
311
672
|
Footer = 'Footer',
|
|
673
|
+
/** Main element */
|
|
312
674
|
Main = 'Main',
|
|
675
|
+
/** Figure element */
|
|
313
676
|
Figure = 'Figure',
|
|
677
|
+
/** Figure caption */
|
|
314
678
|
Figcaption = 'Figcaption',
|
|
679
|
+
/** Time element */
|
|
315
680
|
Time = 'Time',
|
|
681
|
+
/** Details element */
|
|
316
682
|
Details = 'Details',
|
|
683
|
+
/** Summary element */
|
|
317
684
|
Summary = 'Summary',
|
|
685
|
+
/** Form element */
|
|
318
686
|
Form = 'Form',
|
|
687
|
+
/** Input element */
|
|
319
688
|
Input = 'Input',
|
|
689
|
+
/** Select element */
|
|
320
690
|
Select = 'Select',
|
|
691
|
+
/** Option element */
|
|
321
692
|
Option = 'Option',
|
|
693
|
+
/** Button element */
|
|
322
694
|
Button = 'Button',
|
|
695
|
+
/** Textarea element */
|
|
323
696
|
Textarea = 'Textarea',
|
|
697
|
+
/** Label element */
|
|
324
698
|
Label = 'Label',
|
|
699
|
+
/** Fieldset element */
|
|
325
700
|
Fieldset = 'Fieldset',
|
|
701
|
+
/** Legend element */
|
|
326
702
|
Legend = 'Legend',
|
|
703
|
+
/** Audio element */
|
|
327
704
|
Audio = 'Audio',
|
|
705
|
+
/** Video element */
|
|
328
706
|
Video = 'Video',
|
|
707
|
+
/** Picture element */
|
|
329
708
|
Picture = 'Picture',
|
|
709
|
+
/** Source element */
|
|
330
710
|
Source = 'Source',
|
|
711
|
+
/** Iframe element */
|
|
331
712
|
Iframe = 'Iframe',
|
|
713
|
+
/** SVG element */
|
|
332
714
|
Svg = 'Svg',
|
|
715
|
+
/** Canvas element */
|
|
333
716
|
Canvas = 'Canvas',
|
|
717
|
+
/** Ruby annotation */
|
|
334
718
|
Ruby = 'Ruby',
|
|
719
|
+
/** Ruby text */
|
|
335
720
|
Rt = 'Rt',
|
|
721
|
+
/** Ruby parenthesis */
|
|
336
722
|
Rp = 'Rp',
|
|
723
|
+
/** Abbreviation */
|
|
337
724
|
Abbr = 'Abbr',
|
|
725
|
+
/** Keyboard input */
|
|
338
726
|
Kbd = 'Kbd',
|
|
727
|
+
/** Sample output */
|
|
339
728
|
Samp = 'Samp',
|
|
729
|
+
/** Variable */
|
|
340
730
|
Var = 'Var',
|
|
731
|
+
/** Citation */
|
|
341
732
|
Cite = 'Cite',
|
|
733
|
+
/** Quote */
|
|
342
734
|
Q = 'Q',
|
|
735
|
+
/** Deleted text */
|
|
343
736
|
Del = 'Del',
|
|
737
|
+
/** Inserted text */
|
|
344
738
|
Ins = 'Ins',
|
|
739
|
+
/** Data element */
|
|
345
740
|
Data = 'Data',
|
|
741
|
+
/** Meter element */
|
|
346
742
|
Meter = 'Meter',
|
|
743
|
+
/** Progress element */
|
|
347
744
|
Progress = 'Progress',
|
|
745
|
+
/** Output element */
|
|
348
746
|
Output = 'Output',
|
|
747
|
+
/** Template element */
|
|
349
748
|
Template = 'Template',
|
|
749
|
+
/** Slot element */
|
|
350
750
|
Slot = 'Slot',
|
|
751
|
+
/** HTML root element */
|
|
351
752
|
Html = 'Html',
|
|
753
|
+
/** Head element */
|
|
352
754
|
Head = 'Head',
|
|
755
|
+
/** Body element */
|
|
353
756
|
Body = 'Body',
|
|
757
|
+
/** Title element */
|
|
354
758
|
Title = 'Title',
|
|
759
|
+
/** Meta element */
|
|
355
760
|
Meta = 'Meta',
|
|
761
|
+
/** Link element (not anchor) */
|
|
356
762
|
LinkTag = 'LinkTag',
|
|
763
|
+
/** Style element */
|
|
357
764
|
Style = 'Style',
|
|
765
|
+
/** Script element */
|
|
358
766
|
Script = 'Script',
|
|
767
|
+
/** Base element */
|
|
359
768
|
Base = 'Base',
|
|
769
|
+
/** Custom element (web components) or unknown tag */
|
|
360
770
|
Custom = 'Custom'
|
|
361
771
|
}
|
|
362
772
|
|
|
363
|
-
|
|
773
|
+
/**
|
|
774
|
+
* Output format for conversion.
|
|
775
|
+
*
|
|
776
|
+
* Specifies the target markup language format for the conversion output.
|
|
777
|
+
*/
|
|
778
|
+
export declare const enum OutputFormat {
|
|
779
|
+
/** Standard Markdown (CommonMark compatible). Default. */
|
|
364
780
|
Markdown = 'Markdown',
|
|
781
|
+
/** Djot lightweight markup language. */
|
|
365
782
|
Djot = 'Djot',
|
|
783
|
+
/** Plain text output (no markup, visible text only). */
|
|
366
784
|
Plain = 'Plain'
|
|
367
785
|
}
|
|
368
786
|
|
|
369
|
-
|
|
787
|
+
/** HTML preprocessing options for document cleanup before conversion. */
|
|
788
|
+
export interface PreprocessingOptions {
|
|
789
|
+
/** Enable HTML preprocessing globally */
|
|
370
790
|
enabled?: boolean
|
|
791
|
+
/** Preprocessing preset level (Minimal, Standard, Aggressive) */
|
|
371
792
|
preset?: JsPreprocessingPreset
|
|
793
|
+
/** Remove navigation elements (nav, breadcrumbs, menus, sidebars) */
|
|
372
794
|
removeNavigation?: boolean
|
|
795
|
+
/** Remove form elements (forms, inputs, buttons, etc.) */
|
|
373
796
|
removeForms?: boolean
|
|
374
797
|
}
|
|
375
798
|
|
|
376
|
-
export
|
|
377
|
-
enabled?: boolean
|
|
378
|
-
preset?: JsPreprocessingPreset
|
|
379
|
-
removeNavigation?: boolean
|
|
380
|
-
removeForms?: boolean
|
|
381
|
-
}
|
|
799
|
+
export declare function preprocessingOptionsDefault(): PreprocessingOptions
|
|
382
800
|
|
|
383
|
-
|
|
801
|
+
/**
|
|
802
|
+
* HTML preprocessing aggressiveness level.
|
|
803
|
+
*
|
|
804
|
+
* Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
|
|
805
|
+
*/
|
|
806
|
+
export declare const enum PreprocessingPreset {
|
|
807
|
+
/** Minimal cleanup. Remove only essential noise (scripts, styles). */
|
|
384
808
|
Minimal = 'Minimal',
|
|
809
|
+
/** Standard cleanup. Default. Removes navigation, forms, and other auxiliary content. */
|
|
385
810
|
Standard = 'Standard',
|
|
811
|
+
/** Aggressive cleanup. Remove extensive non-content elements and structure. */
|
|
386
812
|
Aggressive = 'Aggressive'
|
|
387
813
|
}
|
|
388
814
|
|
|
389
|
-
|
|
815
|
+
/**
|
|
816
|
+
* A non-fatal diagnostic produced during HTML conversion.
|
|
817
|
+
*
|
|
818
|
+
* Warnings indicate that conversion completed but some content may have been handled
|
|
819
|
+
* differently than expected — for example, an image that could not be extracted, a truncated
|
|
820
|
+
* input, or malformed HTML that was repaired with best-effort parsing.
|
|
821
|
+
*
|
|
822
|
+
* Conversion always succeeds (returns `ConversionResult`) even when warnings are
|
|
823
|
+
* present. Callers should inspect `warnings` and decide how to
|
|
824
|
+
* handle them based on their tolerance for partial results:
|
|
825
|
+
*
|
|
826
|
+
* - **Logging pipelines**: emit each warning at `WARN` level and continue.
|
|
827
|
+
* - **Strict pipelines**: treat any warning as a hard error by checking
|
|
828
|
+
* `result.warnings.length === 0` before using the output.
|
|
829
|
+
*
|
|
830
|
+
* See `WarningKind` for the full taxonomy of warning categories.
|
|
831
|
+
*/
|
|
832
|
+
export interface ProcessingWarning {
|
|
833
|
+
/** Human-readable warning message. */
|
|
390
834
|
message: string
|
|
835
|
+
/** The category of warning. */
|
|
391
836
|
kind: JsWarningKind
|
|
392
837
|
}
|
|
393
838
|
|
|
394
|
-
|
|
839
|
+
/**
|
|
840
|
+
* Structured data block (JSON-LD, Microdata, or RDFa).
|
|
841
|
+
*
|
|
842
|
+
* Represents machine-readable structured data found in the document.
|
|
843
|
+
* JSON-LD blocks are collected as raw JSON strings for flexibility.
|
|
844
|
+
*
|
|
845
|
+
* # Examples
|
|
846
|
+
*/
|
|
847
|
+
export interface StructuredData {
|
|
848
|
+
/** Type of structured data (JSON-LD, Microdata, RDFa) */
|
|
395
849
|
dataType: JsStructuredDataType
|
|
850
|
+
/** Raw JSON string (for JSON-LD) or serialized representation */
|
|
396
851
|
rawJson: string
|
|
852
|
+
/** Schema type if detectable (e.g., "Article", "Event", "Product") */
|
|
397
853
|
schemaType?: string
|
|
398
854
|
}
|
|
399
855
|
|
|
400
|
-
|
|
856
|
+
/**
|
|
857
|
+
* Structured data format type.
|
|
858
|
+
*
|
|
859
|
+
* Identifies the schema/format used for structured data markup.
|
|
860
|
+
*/
|
|
861
|
+
export declare const enum StructuredDataType {
|
|
862
|
+
/** JSON-LD (JSON for Linking Data) script blocks */
|
|
401
863
|
JsonLd = 'json_ld',
|
|
864
|
+
/** HTML5 Microdata attributes (itemscope, itemtype, itemprop) */
|
|
402
865
|
Microdata = 'microdata',
|
|
866
|
+
/** RDF in Attributes (RDFa) markup */
|
|
403
867
|
RDFa = 'rdfa'
|
|
404
868
|
}
|
|
405
869
|
|
|
406
|
-
|
|
407
|
-
|
|
870
|
+
/** A top-level extracted table with both structured data and markdown representation. */
|
|
871
|
+
export interface TableData {
|
|
872
|
+
/** The structured table grid. */
|
|
873
|
+
grid: TableGrid
|
|
874
|
+
/** The markdown rendering of this table. */
|
|
408
875
|
markdown: string
|
|
409
876
|
}
|
|
410
877
|
|
|
411
|
-
|
|
878
|
+
/** A structured table grid with cell-level data including spans. */
|
|
879
|
+
export interface TableGrid {
|
|
880
|
+
/** Number of rows. */
|
|
412
881
|
rows?: number
|
|
882
|
+
/** Number of columns. */
|
|
413
883
|
cols?: number
|
|
884
|
+
/**
|
|
885
|
+
* All cells in the table as a flat, sparse list.
|
|
886
|
+
*
|
|
887
|
+
* The list is ordered by `(row, col)` but is **not** a dense `rows × cols` matrix: cells
|
|
888
|
+
* that are covered by a spanning cell (via `row_span > 1` or `col_span > 1`) do not appear
|
|
889
|
+
* in the list. Only the top-left "origin" cell of a span is present, with its `row_span`
|
|
890
|
+
* and `col_span` fields set accordingly.
|
|
891
|
+
*
|
|
892
|
+
* To reconstruct the full visual grid, iterate over all cells and mark the rectangular
|
|
893
|
+
* region `[row .. row+row_span, col .. col+col_span]` as occupied by that cell. Any
|
|
894
|
+
* `(row, col)` position that is not the origin of any cell is covered by a span from an
|
|
895
|
+
* earlier cell.
|
|
896
|
+
*
|
|
897
|
+
* The length of this array is `≤ rows * cols`. An empty table (`rows == 0 || cols == 0`)
|
|
898
|
+
* produces an empty array.
|
|
899
|
+
*/
|
|
414
900
|
cells?: Array<JsGridCell>
|
|
415
901
|
}
|
|
416
902
|
|
|
417
|
-
|
|
903
|
+
/**
|
|
904
|
+
* A styling or semantic annotation that applies to a byte range within a node's text.
|
|
905
|
+
*
|
|
906
|
+
* Unlike `DocumentNode`, which captures block-level structure (headings, paragraphs, etc.),
|
|
907
|
+
* a `TextAnnotation` describes inline-level markup — bold, italic, links, code spans, and
|
|
908
|
+
* similar — that spans a contiguous run of bytes inside `DocumentNode.content`'s text field.
|
|
909
|
+
*
|
|
910
|
+
* Byte offsets (`start`..`end`) are into the UTF-8 encoded text of the parent node. The range
|
|
911
|
+
* follows Rust slice conventions: `start` is inclusive and `end` is exclusive, so the annotated
|
|
912
|
+
* text is bytes `start..end` of the UTF-8 encoded text (e.g. `Buffer.from(text, 'utf8').subarray(start, end)`), not `text.slice(start, end)`.
|
|
913
|
+
*
|
|
914
|
+
* Multiple annotations on the same node can overlap (e.g. bold-italic text), and they are
|
|
915
|
+
* stored in the order they are encountered during DOM traversal.
|
|
916
|
+
*
|
|
917
|
+
* See `AnnotationKind` for the full list of supported annotation types.
|
|
918
|
+
*/
|
|
919
|
+
export interface TextAnnotation {
|
|
920
|
+
/** Start byte offset (inclusive) into the parent node's text. */
|
|
418
921
|
start: number
|
|
922
|
+
/** End byte offset (exclusive) into the parent node's text. */
|
|
419
923
|
end: number
|
|
924
|
+
/** The type of annotation. */
|
|
420
925
|
kind: JsAnnotationKind
|
|
421
926
|
}
|
|
422
927
|
|
|
423
|
-
|
|
928
|
+
/**
|
|
929
|
+
* Text directionality of document content.
|
|
930
|
+
*
|
|
931
|
+
* Corresponds to the HTML `dir` attribute and `bdi` element directionality.
|
|
932
|
+
*/
|
|
933
|
+
export declare const enum TextDirection {
|
|
934
|
+
/** Left-to-right text flow (default for Latin scripts) */
|
|
424
935
|
LeftToRight = 'ltr',
|
|
936
|
+
/** Right-to-left text flow (Hebrew, Arabic, Urdu, etc.) */
|
|
425
937
|
RightToLeft = 'rtl',
|
|
938
|
+
/** Automatic directionality detection */
|
|
426
939
|
Auto = 'auto'
|
|
427
940
|
}
|
|
428
941
|
|
|
429
|
-
|
|
942
|
+
/**
|
|
943
|
+
* Result of a visitor callback.
|
|
944
|
+
*
|
|
945
|
+
* Allows visitors to control the conversion flow by either proceeding
|
|
946
|
+
* with default behavior, providing custom output, skipping elements,
|
|
947
|
+
* preserving HTML, or signaling errors.
|
|
948
|
+
*/
|
|
949
|
+
export declare const enum VisitResult {
|
|
950
|
+
/** Continue with default conversion behavior */
|
|
430
951
|
Continue = 'Continue',
|
|
952
|
+
/**
|
|
953
|
+
* Replace default output with custom markdown
|
|
954
|
+
*
|
|
955
|
+
* The visitor takes full responsibility for the markdown output
|
|
956
|
+
* of this node and its children.
|
|
957
|
+
*/
|
|
431
958
|
Custom = 'Custom',
|
|
959
|
+
/**
|
|
960
|
+
* Skip this element entirely (don't output anything)
|
|
961
|
+
*
|
|
962
|
+
* The element and all its children are ignored in the output.
|
|
963
|
+
*/
|
|
432
964
|
Skip = 'Skip',
|
|
965
|
+
/**
|
|
966
|
+
* Preserve original HTML (don't convert to markdown)
|
|
967
|
+
*
|
|
968
|
+
* The element's raw HTML is included verbatim in the output.
|
|
969
|
+
*/
|
|
433
970
|
PreserveHtml = 'PreserveHtml',
|
|
971
|
+
/**
|
|
972
|
+
* Stop conversion with an error
|
|
973
|
+
*
|
|
974
|
+
* The conversion process halts and returns this error message.
|
|
975
|
+
*/
|
|
434
976
|
Error = 'Error'
|
|
435
977
|
}
|
|
436
978
|
|
|
437
|
-
|
|
979
|
+
/** Categories of processing warnings. */
|
|
980
|
+
export declare const enum WarningKind {
|
|
981
|
+
/** An image could not be extracted (e.g. invalid data URI, unsupported format). */
|
|
438
982
|
ImageExtractionFailed = 'image_extraction_failed',
|
|
983
|
+
/** The input encoding was not recognized; fell back to UTF-8. */
|
|
439
984
|
EncodingFallback = 'encoding_fallback',
|
|
985
|
+
/** The input was truncated due to size limits. */
|
|
440
986
|
TruncatedInput = 'truncated_input',
|
|
987
|
+
/** The HTML was malformed but processing continued with best effort. */
|
|
441
988
|
MalformedHtml = 'malformed_html',
|
|
989
|
+
/** Sanitization was applied to remove potentially unsafe content. */
|
|
442
990
|
SanitizationApplied = 'sanitization_applied',
|
|
991
|
+
/** DOM traversal was truncated because max_depth was exceeded. */
|
|
443
992
|
DepthLimitExceeded = 'depth_limit_exceeded'
|
|
444
993
|
}
|
|
445
994
|
|
|
446
|
-
|
|
995
|
+
/**
|
|
996
|
+
* Whitespace handling strategy during conversion.
|
|
997
|
+
*
|
|
998
|
+
* Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
|
|
999
|
+
*/
|
|
1000
|
+
export declare const enum WhitespaceMode {
|
|
1001
|
+
/** Collapse multiple whitespace characters to single spaces. Default. Matches browser behavior. */
|
|
447
1002
|
Normalized = 'Normalized',
|
|
1003
|
+
/** Preserve all whitespace exactly as it appears in the HTML. */
|
|
448
1004
|
Strict = 'Strict'
|
|
449
1005
|
}
|