@kreuzberg/html-to-markdown-node 3.4.0-rc.9 → 3.5.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/index.d.ts +676 -112
- package/index.js +110 -566
- package/package.json +27 -49
package/index.d.ts
CHANGED
|
@@ -1,242 +1,553 @@
|
|
|
1
1
|
/* auto-generated by NAPI-RS */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Shareable, thread-safe handle to a user-provided HTML visitor implementation.
|
|
5
|
+
*
|
|
6
|
+
* Pass an instance wrapped in this handle to `ConversionOptions` to
|
|
7
|
+
* customise how the HTML document is traversed and converted to Markdown.
|
|
8
|
+
* The handle may be cloned and shared across threads without additional
|
|
9
|
+
* synchronisation on the caller's side.
|
|
10
|
+
*/
|
|
11
|
+
export declare class VisitorHandle {
|
|
12
|
+
|
|
13
|
+
}
|
|
14
|
+
export type JsVisitorHandle = VisitorHandle
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* The type of an inline text annotation.
|
|
18
|
+
*
|
|
19
|
+
* Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
20
|
+
*/
|
|
21
|
+
export interface AnnotationKind {
|
|
13
22
|
annotation_type: string
|
|
14
23
|
url?: string
|
|
15
24
|
title?: string
|
|
16
25
|
}
|
|
17
26
|
|
|
18
|
-
|
|
27
|
+
/**
|
|
28
|
+
* Code block fence style in Markdown output.
|
|
29
|
+
*
|
|
30
|
+
* Determines how code blocks (`<pre><code>`) are rendered in Markdown.
|
|
31
|
+
*/
|
|
32
|
+
export declare const enum CodeBlockStyle {
|
|
33
|
+
/** Indented code blocks (4 spaces). `CommonMark` standard. */
|
|
19
34
|
Indented = 'Indented',
|
|
35
|
+
/** Fenced code blocks with backticks (```). Default (GFM). Supports language hints. */
|
|
20
36
|
Backticks = 'Backticks',
|
|
37
|
+
/** Fenced code blocks with tildes (~~~). Supports language hints. */
|
|
21
38
|
Tildes = 'Tildes'
|
|
22
39
|
}
|
|
23
40
|
|
|
24
|
-
|
|
41
|
+
/**
|
|
42
|
+
* Main conversion options for HTML to Markdown conversion.
|
|
43
|
+
*
|
|
44
|
+
* Use `ConversionOptions.builder()` to construct, or `Default.default()` for defaults.
|
|
45
|
+
*
|
|
46
|
+
* # Example
|
|
47
|
+
*/
|
|
48
|
+
export interface ConversionOptions {
|
|
49
|
+
/** Heading style to use in Markdown output (ATX `#` or Setext underline). */
|
|
25
50
|
headingStyle?: JsHeadingStyle
|
|
51
|
+
/** How to indent nested list items (spaces or tab). */
|
|
26
52
|
listIndentType?: JsListIndentType
|
|
53
|
+
/** Number of spaces (or tabs) to use for each level of list indentation. */
|
|
27
54
|
listIndentWidth?: number
|
|
55
|
+
/** Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`). */
|
|
28
56
|
bullets?: string
|
|
57
|
+
/** Character used for bold/italic emphasis markers (`*` or `_`). */
|
|
29
58
|
strongEmSymbol?: string
|
|
59
|
+
/** Escape `*` characters in plain text to avoid unintended bold/italic. */
|
|
30
60
|
escapeAsterisks?: boolean
|
|
61
|
+
/** Escape `_` characters in plain text to avoid unintended bold/italic. */
|
|
31
62
|
escapeUnderscores?: boolean
|
|
63
|
+
/** Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text. */
|
|
32
64
|
escapeMisc?: boolean
|
|
65
|
+
/** Escape ASCII characters that have special meaning in certain Markdown dialects. */
|
|
33
66
|
escapeAscii?: boolean
|
|
67
|
+
/** Default language annotation for fenced code blocks that have no language hint. */
|
|
34
68
|
codeLanguage?: string
|
|
69
|
+
/** Automatically convert bare URLs into Markdown autolinks. */
|
|
35
70
|
autolinks?: boolean
|
|
71
|
+
/** Emit a default title when no `<title>` tag is present. */
|
|
36
72
|
defaultTitle?: boolean
|
|
73
|
+
/** Render `<br>` elements inside table cells as literal line breaks. */
|
|
37
74
|
brInTables?: boolean
|
|
75
|
+
/**
|
|
76
|
+
* Emit tables without column padding (compact GFM format).
|
|
77
|
+
*
|
|
78
|
+
* When `true`, column widths are not computed and cells are emitted with
|
|
79
|
+
* no trailing spaces. Separator rows use exactly `---` per column.
|
|
80
|
+
* Produces token-efficient output suitable for RAG / LLM contexts.
|
|
81
|
+
*
|
|
82
|
+
* Default `false` (aligned padding preserved).
|
|
83
|
+
*/
|
|
84
|
+
compactTables?: boolean
|
|
85
|
+
/** Style used for `<mark>` / highlighted text (e.g. `==text==`). */
|
|
38
86
|
highlightStyle?: JsHighlightStyle
|
|
87
|
+
/**
|
|
88
|
+
* Populate `result.metadata` with `<head>` / `<meta>` extraction
|
|
89
|
+
* (title, description, Open Graph, Twitter Card, JSON-LD, …).
|
|
90
|
+
*
|
|
91
|
+
* Default `true`. Disabling skips the metadata pass only — table
|
|
92
|
+
* extraction into `result.tables` runs unconditionally.
|
|
93
|
+
*/
|
|
39
94
|
extractMetadata?: boolean
|
|
95
|
+
/**
|
|
96
|
+
* Controls how whitespace sequences are normalised in the converted output.
|
|
97
|
+
*
|
|
98
|
+
* - [`WhitespaceMode::Normalized`] (default) — collapses consecutive whitespace characters
|
|
99
|
+
* (spaces, tabs, newlines) to a single space, matching browser rendering behaviour.
|
|
100
|
+
* - [`WhitespaceMode::Strict`] — preserves all whitespace exactly as it appears in the
|
|
101
|
+
* source HTML, including runs of spaces and embedded newlines.
|
|
102
|
+
*
|
|
103
|
+
* Choose `Strict` only when the source HTML uses deliberate whitespace (e.g. pre-formatted
|
|
104
|
+
* content outside `<pre>` tags). For most documents `Normalized` produces cleaner output.
|
|
105
|
+
*/
|
|
40
106
|
whitespaceMode?: JsWhitespaceMode
|
|
107
|
+
/** Strip all newlines from the output, producing a single-line result. */
|
|
41
108
|
stripNewlines?: boolean
|
|
109
|
+
/** Wrap long lines at `wrapWidth` characters. */
|
|
42
110
|
wrap?: boolean
|
|
111
|
+
/**
|
|
112
|
+
* Maximum output line width in characters when `wrap` is `true` (default `80`).
|
|
113
|
+
*
|
|
114
|
+
* Lines are broken at word boundaries so that no line exceeds this length. A value of `0`
|
|
115
|
+
* is treated as "no limit" — equivalent to leaving `wrap` disabled. Has no
|
|
116
|
+
* effect when `wrap` is `false`.
|
|
117
|
+
*/
|
|
43
118
|
wrapWidth?: number
|
|
119
|
+
/** Treat the entire document as inline content (no block-level wrappers). */
|
|
44
120
|
convertAsInline?: boolean
|
|
121
|
+
/** Markdown notation for subscript text (e.g. `"~"`). */
|
|
45
122
|
subSymbol?: string
|
|
123
|
+
/** Markdown notation for superscript text (e.g. `"^"`). */
|
|
46
124
|
supSymbol?: string
|
|
125
|
+
/** How to encode hard line breaks (`<br>`) in Markdown. */
|
|
47
126
|
newlineStyle?: JsNewlineStyle
|
|
127
|
+
/** Style used for code blocks (indented, backtick-fenced, or tilde-fenced). */
|
|
48
128
|
codeBlockStyle?: JsCodeBlockStyle
|
|
129
|
+
/** HTML tag names whose `<img>` children are kept inline instead of block. */
|
|
49
130
|
keepInlineImagesIn?: Array<string>
|
|
131
|
+
/**
|
|
132
|
+
* Options for the HTML pre-processing pass applied before conversion begins.
|
|
133
|
+
*
|
|
134
|
+
* Pre-processing runs before the HTML is handed to the converter and can perform operations
|
|
135
|
+
* such as unwrapping redundant wrapper elements, removing tracking pixels, and normalising
|
|
136
|
+
* vendor-specific markup. See [`PreprocessingOptions`] for the full set of knobs.
|
|
137
|
+
*
|
|
138
|
+
* Defaults to [`PreprocessingOptions::default()`], which enables the standard cleaning
|
|
139
|
+
* passes. Set individual fields on [`PreprocessingOptions`] (or construct via
|
|
140
|
+
* [`ConversionOptions::builder`]) to opt in or out of specific passes.
|
|
141
|
+
*/
|
|
50
142
|
preprocessing?: JsPreprocessingOptions
|
|
143
|
+
/** Expected character encoding of the input HTML (default `"utf-8"`). */
|
|
51
144
|
encoding?: string
|
|
145
|
+
/** Emit debug information during conversion. */
|
|
52
146
|
debug?: boolean
|
|
147
|
+
/** HTML tag names whose tag wrapper is removed from the output while their child content is kept. */
|
|
53
148
|
stripTags?: Array<string>
|
|
149
|
+
/** HTML tag names that are preserved verbatim in the output. */
|
|
54
150
|
preserveTags?: Array<string>
|
|
151
|
+
/** Skip conversion of `<img>` elements (omit images from output). */
|
|
55
152
|
skipImages?: boolean
|
|
153
|
+
/** Link rendering style (inline or reference). */
|
|
56
154
|
linkStyle?: JsLinkStyle
|
|
155
|
+
/** Target output format (Markdown, plain text, etc.). */
|
|
57
156
|
outputFormat?: JsOutputFormat
|
|
157
|
+
/** Include structured document tree in result. */
|
|
58
158
|
includeDocumentStructure?: boolean
|
|
159
|
+
/** Extract inline images from data URIs and SVGs. */
|
|
59
160
|
extractImages?: boolean
|
|
161
|
+
/** Maximum decoded image size in bytes (default 5MB). */
|
|
60
162
|
maxImageSize?: number
|
|
163
|
+
/** Capture SVG elements as images. */
|
|
61
164
|
captureSvg?: boolean
|
|
165
|
+
/** Infer image dimensions from data. */
|
|
62
166
|
inferDimensions?: boolean
|
|
167
|
+
/**
|
|
168
|
+
* Maximum DOM traversal depth. `None` means unlimited.
|
|
169
|
+
* When set, subtrees beyond this depth are silently truncated.
|
|
170
|
+
*/
|
|
63
171
|
maxDepth?: number
|
|
172
|
+
/**
|
|
173
|
+
* CSS selectors for elements to exclude entirely (element + all content).
|
|
174
|
+
*
|
|
175
|
+
* Unlike `stripTags` (which removes the tag wrapper but keeps children),
|
|
176
|
+
* excluded elements and all their descendants are dropped from the output.
|
|
177
|
+
* Supports any CSS selector that `tl` supports: tag names, `.class`,
|
|
178
|
+
* `#id`, `[attribute]`, etc.
|
|
179
|
+
*
|
|
180
|
+
* Invalid selectors are silently skipped at conversion time.
|
|
181
|
+
*
|
|
182
|
+
* Example: `[".cookie-banner", "#ad-container", "[role='complementary']"]`
|
|
183
|
+
*/
|
|
64
184
|
excludeSelectors?: Array<string>
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
outputFormat?: JsOutputFormat
|
|
101
|
-
includeDocumentStructure?: boolean
|
|
102
|
-
extractImages?: boolean
|
|
103
|
-
maxImageSize?: number
|
|
104
|
-
captureSvg?: boolean
|
|
105
|
-
inferDimensions?: boolean
|
|
106
|
-
maxDepth?: number
|
|
107
|
-
excludeSelectors?: Array<string>
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
export interface JsConversionResult {
|
|
185
|
+
/**
|
|
186
|
+
* Optional visitor for custom traversal logic.
|
|
187
|
+
*
|
|
188
|
+
* When set, the visitor's callbacks are invoked for matching HTML elements
|
|
189
|
+
* during conversion, allowing custom output, skipping, or HTML preservation.
|
|
190
|
+
* See `HtmlVisitor`.
|
|
191
|
+
*/
|
|
192
|
+
visitor?: object
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
export declare function conversionOptionsDefault(): ConversionOptions
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* The primary result of HTML conversion and extraction.
|
|
199
|
+
*
|
|
200
|
+
* Contains the converted text output, optional structured document tree,
|
|
201
|
+
* metadata, extracted tables, images, and processing warnings.
|
|
202
|
+
*
|
|
203
|
+
* # Example
|
|
204
|
+
*
|
|
205
|
+
* ```text
|
|
206
|
+
* use html_to_markdown_rs::{convert, ConversionOptions};
|
|
207
|
+
*
|
|
208
|
+
* let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
209
|
+
* assert!(result.content.is_some());
|
|
210
|
+
* assert!(result.warnings.is_empty());
|
|
211
|
+
* ```
|
|
212
|
+
*/
|
|
213
|
+
export interface ConversionResult {
|
|
214
|
+
/**
|
|
215
|
+
* Converted text output (markdown, djot, or plain text).
|
|
216
|
+
*
|
|
217
|
+
* `None` when `output_format` is set to `OutputFormat::None`,
|
|
218
|
+
* indicating extraction-only mode.
|
|
219
|
+
*/
|
|
111
220
|
content?: string
|
|
112
|
-
|
|
113
|
-
|
|
221
|
+
/**
|
|
222
|
+
* Structured document tree with semantic elements.
|
|
223
|
+
*
|
|
224
|
+
* Populated when `ConversionOptions::include_document_structure` is `true`. `None`
|
|
225
|
+
* otherwise (the default), which avoids the overhead of building the tree.
|
|
226
|
+
*
|
|
227
|
+
* When present, the tree mirrors the converted document: headings open
|
|
228
|
+
* `Group` sections, paragraphs and list items carry
|
|
229
|
+
* inline `TextAnnotation`s, and tables reference the same
|
|
230
|
+
* `TableGrid` data exposed in `tables`.
|
|
231
|
+
*
|
|
232
|
+
* Note: this field is independent of the `metadata` feature flag. Document structure
|
|
233
|
+
* collection is always available at runtime; it is gated only by the runtime option, not
|
|
234
|
+
* by a compile-time feature.
|
|
235
|
+
*/
|
|
236
|
+
document?: DocumentStructure
|
|
237
|
+
/** Extracted HTML metadata (title, OG, links, images, structured data). */
|
|
238
|
+
metadata?: HtmlMetadata
|
|
239
|
+
/** Extracted tables with structured cell data and markdown representation. */
|
|
114
240
|
tables?: Array<JsTableData>
|
|
241
|
+
/**
|
|
242
|
+
* Extracted inline images (data URIs and SVGs).
|
|
243
|
+
*
|
|
244
|
+
* Populated when `extractImages` is `true` in options.
|
|
245
|
+
*/
|
|
115
246
|
images?: Array<string>
|
|
247
|
+
/** Non-fatal processing warnings. */
|
|
116
248
|
warnings?: Array<JsProcessingWarning>
|
|
117
249
|
}
|
|
118
250
|
|
|
119
|
-
export
|
|
251
|
+
export declare function convert(html: string, options?: ConversionOptions | undefined | null, visitor?: object | undefined | null): ConversionResult
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Document-level metadata extracted from `<head>` and top-level elements.
|
|
255
|
+
*
|
|
256
|
+
* Contains all metadata typically used by search engines, social media platforms,
|
|
257
|
+
* and browsers for document indexing and presentation.
|
|
258
|
+
*
|
|
259
|
+
* # Examples
|
|
260
|
+
*/
|
|
261
|
+
export interface DocumentMetadata {
|
|
262
|
+
/** Document title from `<title>` tag */
|
|
120
263
|
title?: string
|
|
264
|
+
/** Document description from `<meta name="description">` tag */
|
|
121
265
|
description?: string
|
|
266
|
+
/** Document keywords from `<meta name="keywords">` tag, split on commas */
|
|
122
267
|
keywords?: Array<string>
|
|
268
|
+
/** Document author from `<meta name="author">` tag */
|
|
123
269
|
author?: string
|
|
270
|
+
/** Canonical URL from `<link rel="canonical">` tag */
|
|
124
271
|
canonicalUrl?: string
|
|
272
|
+
/** Base URL from `<base href="">` tag for resolving relative URLs */
|
|
125
273
|
baseHref?: string
|
|
274
|
+
/** Document language from `lang` attribute */
|
|
126
275
|
language?: string
|
|
276
|
+
/** Document text direction from `dir` attribute */
|
|
127
277
|
textDirection?: JsTextDirection
|
|
278
|
+
/**
|
|
279
|
+
* Open Graph metadata (og:* properties) for social media
|
|
280
|
+
* Keys like "title", "description", "image", "url", etc.
|
|
281
|
+
*/
|
|
128
282
|
openGraph?: Record<string, string>
|
|
283
|
+
/**
|
|
284
|
+
* Twitter Card metadata (twitter:* properties)
|
|
285
|
+
* Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|
286
|
+
*/
|
|
129
287
|
twitterCard?: Record<string, string>
|
|
288
|
+
/**
|
|
289
|
+
* Additional meta tags not covered by specific fields
|
|
290
|
+
* Keys are meta name/property attributes, values are content
|
|
291
|
+
*/
|
|
130
292
|
metaTags?: Record<string, string>
|
|
131
293
|
}
|
|
132
294
|
|
|
133
|
-
|
|
295
|
+
/** A single node in the document tree. */
|
|
296
|
+
export interface DocumentNode {
|
|
297
|
+
/** Deterministic node identifier. */
|
|
134
298
|
id: string
|
|
299
|
+
/** The semantic content of this node. */
|
|
135
300
|
content: JsNodeContent
|
|
301
|
+
/** Index of the parent node (None for root nodes). */
|
|
136
302
|
parent?: number
|
|
303
|
+
/** Indices of child nodes in reading order. */
|
|
137
304
|
children: Array<number>
|
|
305
|
+
/** Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text. */
|
|
138
306
|
annotations: Array<JsTextAnnotation>
|
|
307
|
+
/**
|
|
308
|
+
* Format-specific attributes preserved from the source HTML element.
|
|
309
|
+
*
|
|
310
|
+
* Keys are lowercased attribute names as they appear in the HTML (e.g. `"class"`, `"id"`,
|
|
311
|
+
* `"data-foo"`). Values are the raw attribute strings, copied verbatim from the source —
|
|
312
|
+
* no HTML entity decoding is applied here.
|
|
313
|
+
*
|
|
314
|
+
* The map is `None` when no attributes are present (omitted entirely in serialized output).
|
|
315
|
+
* Not every HTML attribute is preserved: only attributes that carry semantic or structural
|
|
316
|
+
* significance for the node type are collected. For example, heading nodes capture the `"id"`
|
|
317
|
+
* attribute for anchor linking; other element-level attributes may be silently dropped.
|
|
318
|
+
*/
|
|
139
319
|
attributes?: Record<string, string>
|
|
140
320
|
}
|
|
141
321
|
|
|
142
|
-
|
|
322
|
+
/**
|
|
323
|
+
* A structured document tree representing the semantic content of an HTML document.
|
|
324
|
+
*
|
|
325
|
+
* Uses a flat node array with index-based parent/child references for efficient traversal.
|
|
326
|
+
*/
|
|
327
|
+
export interface DocumentStructure {
|
|
328
|
+
/** All nodes in document reading order. */
|
|
143
329
|
nodes: Array<JsDocumentNode>
|
|
330
|
+
/** The source format (always "html" for this crate). */
|
|
144
331
|
sourceFormat?: string
|
|
145
332
|
}
|
|
146
333
|
|
|
147
|
-
|
|
334
|
+
/** A single cell in a table grid. */
|
|
335
|
+
export interface GridCell {
|
|
336
|
+
/** The text content of the cell. */
|
|
148
337
|
content: string
|
|
338
|
+
/** 0-indexed row position. */
|
|
149
339
|
row: number
|
|
340
|
+
/** 0-indexed column position. */
|
|
150
341
|
col: number
|
|
342
|
+
/** Number of rows this cell spans (default 1). */
|
|
151
343
|
rowSpan: number
|
|
344
|
+
/** Number of columns this cell spans (default 1). */
|
|
152
345
|
colSpan: number
|
|
346
|
+
/** Whether this is a header cell (`<th>`). */
|
|
153
347
|
isHeader: boolean
|
|
154
348
|
}
|
|
155
349
|
|
|
156
|
-
|
|
350
|
+
/**
|
|
351
|
+
* Header element metadata with hierarchy tracking.
|
|
352
|
+
*
|
|
353
|
+
* Captures heading elements (h1-h6) with their text content, identifiers,
|
|
354
|
+
* and position in the document structure.
|
|
355
|
+
*
|
|
356
|
+
* # Examples
|
|
357
|
+
*/
|
|
358
|
+
export interface HeaderMetadata {
|
|
359
|
+
/** Header level: 1 (h1) through 6 (h6) */
|
|
157
360
|
level: number
|
|
361
|
+
/** Normalized text content of the header */
|
|
158
362
|
text: string
|
|
363
|
+
/** HTML id attribute if present */
|
|
159
364
|
id?: string
|
|
365
|
+
/** Document tree depth at the header element */
|
|
160
366
|
depth: number
|
|
367
|
+
/** Byte offset in original HTML document */
|
|
161
368
|
htmlOffset: number
|
|
162
369
|
}
|
|
163
370
|
|
|
164
|
-
|
|
371
|
+
/**
|
|
372
|
+
* Heading style options for Markdown output.
|
|
373
|
+
*
|
|
374
|
+
* Controls how headings (h1-h6) are rendered in the output Markdown.
|
|
375
|
+
*/
|
|
376
|
+
export declare const enum HeadingStyle {
|
|
377
|
+
/** Underlined style (=== for h1, --- for h2). */
|
|
165
378
|
Underlined = 'Underlined',
|
|
379
|
+
/** ATX style (# for h1, ## for h2, etc.). Default. */
|
|
166
380
|
Atx = 'Atx',
|
|
381
|
+
/** ATX closed style (# title #, with closing hashes). */
|
|
167
382
|
AtxClosed = 'AtxClosed'
|
|
168
383
|
}
|
|
169
384
|
|
|
170
|
-
|
|
385
|
+
/**
|
|
386
|
+
* Highlight rendering style for `<mark>` elements.
|
|
387
|
+
*
|
|
388
|
+
* Controls how highlighted text is rendered in Markdown output.
|
|
389
|
+
*/
|
|
390
|
+
export declare const enum HighlightStyle {
|
|
391
|
+
/** Double equals syntax (==text==). Default. Pandoc-compatible. */
|
|
171
392
|
DoubleEqual = 'DoubleEqual',
|
|
393
|
+
/** Preserve as HTML (`<mark>text</mark>`). Original HTML tag. */
|
|
172
394
|
Html = 'Html',
|
|
395
|
+
/** Render as bold (**text**). Uses strong emphasis. */
|
|
173
396
|
Bold = 'Bold',
|
|
397
|
+
/** Strip formatting, render as plain text. No markup. */
|
|
174
398
|
None = 'None'
|
|
175
399
|
}
|
|
176
400
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
export interface
|
|
401
|
+
/**
|
|
402
|
+
* Comprehensive metadata extraction result from HTML document.
|
|
403
|
+
*
|
|
404
|
+
* Contains all extracted metadata types in a single structure,
|
|
405
|
+
* suitable for serialization and transmission across language boundaries.
|
|
406
|
+
*
|
|
407
|
+
* # Examples
|
|
408
|
+
*/
|
|
409
|
+
export interface HtmlMetadata {
|
|
410
|
+
/** Document-level metadata (title, description, canonical, etc.) */
|
|
411
|
+
document?: DocumentMetadata
|
|
412
|
+
/** Extracted header elements with hierarchy */
|
|
413
|
+
headers?: Array<HeaderMetadata>
|
|
414
|
+
/** Extracted hyperlinks with type classification */
|
|
415
|
+
links?: Array<LinkMetadata>
|
|
416
|
+
/** Extracted images with source and dimensions */
|
|
417
|
+
images?: Array<ImageMetadata>
|
|
418
|
+
/** Extracted structured data blocks */
|
|
419
|
+
structuredData?: Array<StructuredData>
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* Image metadata with source and dimensions.
|
|
424
|
+
*
|
|
425
|
+
* Captures `<img>` elements and inline `<svg>` elements with metadata
|
|
426
|
+
* for image analysis and optimization.
|
|
427
|
+
*
|
|
428
|
+
* # Examples
|
|
429
|
+
*/
|
|
430
|
+
export interface ImageMetadata {
|
|
431
|
+
/** Image source (URL, data URI, or SVG content identifier) */
|
|
186
432
|
src: string
|
|
433
|
+
/** Alternative text from alt attribute (for accessibility) */
|
|
187
434
|
alt?: string
|
|
435
|
+
/** Title attribute (often shown as tooltip) */
|
|
188
436
|
title?: string
|
|
437
|
+
/** Image dimensions as (width, height) if available */
|
|
189
438
|
dimensions?: Array<number>
|
|
439
|
+
/** Image type classification */
|
|
190
440
|
imageType: JsImageType
|
|
441
|
+
/** Additional HTML attributes */
|
|
191
442
|
attributes: Record<string, string>
|
|
192
443
|
}
|
|
193
444
|
|
|
194
|
-
|
|
445
|
+
/**
|
|
446
|
+
* Image source classification for proper handling and processing.
|
|
447
|
+
*
|
|
448
|
+
* Determines whether an image is embedded (data URI), inline SVG, external, or relative.
|
|
449
|
+
*/
|
|
450
|
+
export declare const enum ImageType {
|
|
451
|
+
/** Data URI embedded image (base64 or other encoding) */
|
|
195
452
|
DataUri = 'data_uri',
|
|
453
|
+
/** Inline SVG element */
|
|
196
454
|
InlineSvg = 'inline_svg',
|
|
455
|
+
/** External image URL (http/https) */
|
|
197
456
|
External = 'external',
|
|
457
|
+
/** Relative image path */
|
|
198
458
|
Relative = 'relative'
|
|
199
459
|
}
|
|
200
460
|
|
|
201
|
-
|
|
461
|
+
/**
|
|
462
|
+
* Hyperlink metadata with categorization and attributes.
|
|
463
|
+
*
|
|
464
|
+
* Represents `<a>` elements with parsed href values, text content, and link type classification.
|
|
465
|
+
*
|
|
466
|
+
* # Examples
|
|
467
|
+
*/
|
|
468
|
+
export interface LinkMetadata {
|
|
469
|
+
/** The href URL value */
|
|
202
470
|
href: string
|
|
471
|
+
/** Link text content (normalized, concatenated if mixed with elements) */
|
|
203
472
|
text: string
|
|
473
|
+
/** Optional title attribute (often shown as tooltip) */
|
|
204
474
|
title?: string
|
|
475
|
+
/** Link type classification */
|
|
205
476
|
linkType: JsLinkType
|
|
477
|
+
/** Rel attribute values (e.g., "nofollow", "stylesheet", "canonical") */
|
|
206
478
|
rel: Array<string>
|
|
479
|
+
/** Additional HTML attributes */
|
|
207
480
|
attributes: Record<string, string>
|
|
208
481
|
}
|
|
209
482
|
|
|
210
|
-
|
|
483
|
+
/**
|
|
484
|
+
* Link rendering style in Markdown output.
|
|
485
|
+
*
|
|
486
|
+
* Controls whether links and images use inline `[text](url)` syntax or
|
|
487
|
+
* reference-style `[text][1]` syntax with definitions collected at the end.
|
|
488
|
+
*/
|
|
489
|
+
export declare const enum LinkStyle {
|
|
490
|
+
/** Inline links: `[text](url)`. Default. */
|
|
211
491
|
Inline = 'Inline',
|
|
492
|
+
/** Reference-style links: `[text][1]` with `[1]: url` at end of document. */
|
|
212
493
|
Reference = 'Reference'
|
|
213
494
|
}
|
|
214
495
|
|
|
215
|
-
|
|
496
|
+
/**
|
|
497
|
+
* Link classification based on href value and document context.
|
|
498
|
+
*
|
|
499
|
+
* Used to categorize links during extraction for filtering and analysis.
|
|
500
|
+
*/
|
|
501
|
+
export declare const enum LinkType {
|
|
502
|
+
/** Anchor link within same document (href starts with #) */
|
|
216
503
|
Anchor = 'anchor',
|
|
504
|
+
/** Internal link within same domain */
|
|
217
505
|
Internal = 'internal',
|
|
506
|
+
/** External link to different domain */
|
|
218
507
|
External = 'external',
|
|
508
|
+
/** Email link (mailto:) */
|
|
219
509
|
Email = 'email',
|
|
510
|
+
/** Phone link (tel:) */
|
|
220
511
|
Phone = 'phone',
|
|
512
|
+
/** Other protocol or unclassifiable */
|
|
221
513
|
Other = 'other'
|
|
222
514
|
}
|
|
223
515
|
|
|
224
|
-
|
|
516
|
+
/**
|
|
517
|
+
* List indentation character type.
|
|
518
|
+
*
|
|
519
|
+
* Controls whether list items are indented with spaces or tabs.
|
|
520
|
+
*/
|
|
521
|
+
export declare const enum ListIndentType {
|
|
522
|
+
/** Use spaces for indentation. Default. Width controlled by `listIndentWidth`. */
|
|
225
523
|
Spaces = 'Spaces',
|
|
524
|
+
/** Use tabs for indentation. */
|
|
226
525
|
Tabs = 'Tabs'
|
|
227
526
|
}
|
|
228
527
|
|
|
229
|
-
|
|
528
|
+
/**
|
|
529
|
+
* Line break syntax in Markdown output.
|
|
530
|
+
*
|
|
531
|
+
* Controls how hard line breaks (from `<br>` or line breaks in source) are rendered.
|
|
532
|
+
*/
|
|
533
|
+
export declare const enum NewlineStyle {
|
|
534
|
+
/** Two trailing spaces at end of line. Default. Standard Markdown syntax. */
|
|
230
535
|
Spaces = 'Spaces',
|
|
536
|
+
/** Backslash at end of line. Alternative Markdown syntax. */
|
|
231
537
|
Backslash = 'Backslash'
|
|
232
538
|
}
|
|
233
539
|
|
|
234
|
-
|
|
540
|
+
/**
|
|
541
|
+
* The semantic content type of a document node.
|
|
542
|
+
*
|
|
543
|
+
* Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
|
|
544
|
+
*/
|
|
545
|
+
export interface NodeContent {
|
|
235
546
|
node_type: string
|
|
236
547
|
level?: number
|
|
237
548
|
text?: string
|
|
238
549
|
ordered?: boolean
|
|
239
|
-
grid?:
|
|
550
|
+
grid?: TableGrid
|
|
240
551
|
description?: string
|
|
241
552
|
src?: string
|
|
242
553
|
imageIndex?: number
|
|
@@ -251,191 +562,444 @@ export interface JsNodeContent {
|
|
|
251
562
|
headingText?: string
|
|
252
563
|
}
|
|
253
564
|
|
|
254
|
-
|
|
565
|
+
/**
|
|
566
|
+
* Context information passed to all visitor methods.
|
|
567
|
+
*
|
|
568
|
+
* Provides comprehensive metadata about the current node being visited,
|
|
569
|
+
* including its type, attributes, position in the DOM tree, and parent context.
|
|
570
|
+
*/
|
|
571
|
+
export interface NodeContext {
|
|
572
|
+
/** Coarse-grained node type classification */
|
|
255
573
|
nodeType: JsNodeType
|
|
574
|
+
/** Raw HTML tag name (e.g., "div", "h1", "custom-element") */
|
|
256
575
|
tagName: string
|
|
576
|
+
/** All HTML attributes as key-value pairs */
|
|
257
577
|
attributes: Record<string, string>
|
|
578
|
+
/** Depth in the DOM tree (0 = root) */
|
|
258
579
|
depth: number
|
|
580
|
+
/** Index among siblings (0-based) */
|
|
259
581
|
indexInParent: number
|
|
582
|
+
/** Parent element's tag name (None if root) */
|
|
260
583
|
parentTag?: string
|
|
584
|
+
/** Whether this element is treated as inline vs block */
|
|
261
585
|
isInline: boolean
|
|
262
586
|
}
|
|
263
587
|
|
|
264
|
-
|
|
588
|
+
/**
|
|
589
|
+
* Node type enumeration covering all HTML element types.
|
|
590
|
+
*
|
|
591
|
+
* This enum categorizes all HTML elements that the converter recognizes,
|
|
592
|
+
* providing a coarse-grained classification for visitor dispatch.
|
|
593
|
+
*/
|
|
594
|
+
export declare const enum NodeType {
|
|
595
|
+
/** Text node (most frequent - 100+ per document) */
|
|
265
596
|
Text = 'Text',
|
|
597
|
+
/** Generic element node */
|
|
266
598
|
Element = 'Element',
|
|
599
|
+
/** Heading elements (h1-h6) */
|
|
267
600
|
Heading = 'Heading',
|
|
601
|
+
/** Paragraph element */
|
|
268
602
|
Paragraph = 'Paragraph',
|
|
603
|
+
/** Generic div container */
|
|
269
604
|
Div = 'Div',
|
|
605
|
+
/** Blockquote element */
|
|
270
606
|
Blockquote = 'Blockquote',
|
|
607
|
+
/** Preformatted text block */
|
|
271
608
|
Pre = 'Pre',
|
|
609
|
+
/** Horizontal rule */
|
|
272
610
|
Hr = 'Hr',
|
|
611
|
+
/** Ordered or unordered list (ul, ol) */
|
|
273
612
|
List = 'List',
|
|
613
|
+
/** List item (li) */
|
|
274
614
|
ListItem = 'ListItem',
|
|
615
|
+
/** Definition list (dl) */
|
|
275
616
|
DefinitionList = 'DefinitionList',
|
|
617
|
+
/** Definition term (dt) */
|
|
276
618
|
DefinitionTerm = 'DefinitionTerm',
|
|
619
|
+
/** Definition description (dd) */
|
|
277
620
|
DefinitionDescription = 'DefinitionDescription',
|
|
621
|
+
/** Table element */
|
|
278
622
|
Table = 'Table',
|
|
623
|
+
/** Table row (tr) */
|
|
279
624
|
TableRow = 'TableRow',
|
|
625
|
+
/** Table cell (td, th) */
|
|
280
626
|
TableCell = 'TableCell',
|
|
627
|
+
/** Table header cell (th) */
|
|
281
628
|
TableHeader = 'TableHeader',
|
|
629
|
+
/** Table body (tbody) */
|
|
282
630
|
TableBody = 'TableBody',
|
|
631
|
+
/** Table head (thead) */
|
|
283
632
|
TableHead = 'TableHead',
|
|
633
|
+
/** Table foot (tfoot) */
|
|
284
634
|
TableFoot = 'TableFoot',
|
|
635
|
+
/** Anchor link (a) */
|
|
285
636
|
Link = 'Link',
|
|
637
|
+
/** Image (img) */
|
|
286
638
|
Image = 'Image',
|
|
639
|
+
/** Strong/bold (strong, b) */
|
|
287
640
|
Strong = 'Strong',
|
|
641
|
+
/** Emphasis/italic (em, i) */
|
|
288
642
|
Em = 'Em',
|
|
643
|
+
/** Inline code (code) */
|
|
289
644
|
Code = 'Code',
|
|
645
|
+
/** Strikethrough (s, del, strike) */
|
|
290
646
|
Strikethrough = 'Strikethrough',
|
|
647
|
+
/** Underline (u, ins) */
|
|
291
648
|
Underline = 'Underline',
|
|
649
|
+
/** Subscript (sub) */
|
|
292
650
|
Subscript = 'Subscript',
|
|
651
|
+
/** Superscript (sup) */
|
|
293
652
|
Superscript = 'Superscript',
|
|
653
|
+
/** Mark/highlight (mark) */
|
|
294
654
|
Mark = 'Mark',
|
|
655
|
+
/** Small text (small) */
|
|
295
656
|
Small = 'Small',
|
|
657
|
+
/** Line break (br) */
|
|
296
658
|
Br = 'Br',
|
|
659
|
+
/** Span element */
|
|
297
660
|
Span = 'Span',
|
|
661
|
+
/** Article element */
|
|
298
662
|
Article = 'Article',
|
|
663
|
+
/** Section element */
|
|
299
664
|
Section = 'Section',
|
|
665
|
+
/** Navigation element */
|
|
300
666
|
Nav = 'Nav',
|
|
667
|
+
/** Aside element */
|
|
301
668
|
Aside = 'Aside',
|
|
669
|
+
/** Header element */
|
|
302
670
|
Header = 'Header',
|
|
671
|
+
/** Footer element */
|
|
303
672
|
Footer = 'Footer',
|
|
673
|
+
/** Main element */
|
|
304
674
|
Main = 'Main',
|
|
675
|
+
/** Figure element */
|
|
305
676
|
Figure = 'Figure',
|
|
677
|
+
/** Figure caption */
|
|
306
678
|
Figcaption = 'Figcaption',
|
|
679
|
+
/** Time element */
|
|
307
680
|
Time = 'Time',
|
|
681
|
+
/** Details element */
|
|
308
682
|
Details = 'Details',
|
|
683
|
+
/** Summary element */
|
|
309
684
|
Summary = 'Summary',
|
|
685
|
+
/** Form element */
|
|
310
686
|
Form = 'Form',
|
|
687
|
+
/** Input element */
|
|
311
688
|
Input = 'Input',
|
|
689
|
+
/** Select element */
|
|
312
690
|
Select = 'Select',
|
|
691
|
+
/** Option element */
|
|
313
692
|
Option = 'Option',
|
|
693
|
+
/** Button element */
|
|
314
694
|
Button = 'Button',
|
|
695
|
+
/** Textarea element */
|
|
315
696
|
Textarea = 'Textarea',
|
|
697
|
+
/** Label element */
|
|
316
698
|
Label = 'Label',
|
|
699
|
+
/** Fieldset element */
|
|
317
700
|
Fieldset = 'Fieldset',
|
|
701
|
+
/** Legend element */
|
|
318
702
|
Legend = 'Legend',
|
|
703
|
+
/** Audio element */
|
|
319
704
|
Audio = 'Audio',
|
|
705
|
+
/** Video element */
|
|
320
706
|
Video = 'Video',
|
|
707
|
+
/** Picture element */
|
|
321
708
|
Picture = 'Picture',
|
|
709
|
+
/** Source element */
|
|
322
710
|
Source = 'Source',
|
|
711
|
+
/** Iframe element */
|
|
323
712
|
Iframe = 'Iframe',
|
|
713
|
+
/** SVG element */
|
|
324
714
|
Svg = 'Svg',
|
|
715
|
+
/** Canvas element */
|
|
325
716
|
Canvas = 'Canvas',
|
|
717
|
+
/** Ruby annotation */
|
|
326
718
|
Ruby = 'Ruby',
|
|
719
|
+
/** Ruby text */
|
|
327
720
|
Rt = 'Rt',
|
|
721
|
+
/** Ruby parenthesis */
|
|
328
722
|
Rp = 'Rp',
|
|
723
|
+
/** Abbreviation */
|
|
329
724
|
Abbr = 'Abbr',
|
|
725
|
+
/** Keyboard input */
|
|
330
726
|
Kbd = 'Kbd',
|
|
727
|
+
/** Sample output */
|
|
331
728
|
Samp = 'Samp',
|
|
729
|
+
/** Variable */
|
|
332
730
|
Var = 'Var',
|
|
731
|
+
/** Citation */
|
|
333
732
|
Cite = 'Cite',
|
|
733
|
+
/** Quote */
|
|
334
734
|
Q = 'Q',
|
|
735
|
+
/** Deleted text */
|
|
335
736
|
Del = 'Del',
|
|
737
|
+
/** Inserted text */
|
|
336
738
|
Ins = 'Ins',
|
|
739
|
+
/** Data element */
|
|
337
740
|
Data = 'Data',
|
|
741
|
+
/** Meter element */
|
|
338
742
|
Meter = 'Meter',
|
|
743
|
+
/** Progress element */
|
|
339
744
|
Progress = 'Progress',
|
|
745
|
+
/** Output element */
|
|
340
746
|
Output = 'Output',
|
|
747
|
+
/** Template element */
|
|
341
748
|
Template = 'Template',
|
|
749
|
+
/** Slot element */
|
|
342
750
|
Slot = 'Slot',
|
|
751
|
+
/** HTML root element */
|
|
343
752
|
Html = 'Html',
|
|
753
|
+
/** Head element */
|
|
344
754
|
Head = 'Head',
|
|
755
|
+
/** Body element */
|
|
345
756
|
Body = 'Body',
|
|
757
|
+
/** Title element */
|
|
346
758
|
Title = 'Title',
|
|
759
|
+
/** Meta element */
|
|
347
760
|
Meta = 'Meta',
|
|
761
|
+
/** Link element (not anchor) */
|
|
348
762
|
LinkTag = 'LinkTag',
|
|
763
|
+
/** Style element */
|
|
349
764
|
Style = 'Style',
|
|
765
|
+
/** Script element */
|
|
350
766
|
Script = 'Script',
|
|
767
|
+
/** Base element */
|
|
351
768
|
Base = 'Base',
|
|
769
|
+
/** Custom element (web components) or unknown tag */
|
|
352
770
|
Custom = 'Custom'
|
|
353
771
|
}
|
|
354
772
|
|
|
355
|
-
|
|
773
|
+
/**
|
|
774
|
+
* Output format for conversion.
|
|
775
|
+
*
|
|
776
|
+
* Specifies the target markup language format for the conversion output.
|
|
777
|
+
*/
|
|
778
|
+
export declare const enum OutputFormat {
|
|
779
|
+
/** Standard Markdown (CommonMark compatible). Default. */
|
|
356
780
|
Markdown = 'Markdown',
|
|
781
|
+
/** Djot lightweight markup language. */
|
|
357
782
|
Djot = 'Djot',
|
|
783
|
+
/** Plain text output (no markup, visible text only). */
|
|
358
784
|
Plain = 'Plain'
|
|
359
785
|
}
|
|
360
786
|
|
|
361
|
-
|
|
787
|
+
/** HTML preprocessing options for document cleanup before conversion. */
|
|
788
|
+
export interface PreprocessingOptions {
|
|
789
|
+
/** Enable HTML preprocessing globally */
|
|
362
790
|
enabled?: boolean
|
|
791
|
+
/** Preprocessing preset level (Minimal, Standard, Aggressive) */
|
|
363
792
|
preset?: JsPreprocessingPreset
|
|
793
|
+
/** Remove navigation elements (nav, breadcrumbs, menus, sidebars) */
|
|
364
794
|
removeNavigation?: boolean
|
|
795
|
+
/** Remove form elements (forms, inputs, buttons, etc.) */
|
|
365
796
|
removeForms?: boolean
|
|
366
797
|
}
|
|
367
798
|
|
|
368
|
-
export
|
|
369
|
-
enabled?: boolean
|
|
370
|
-
preset?: JsPreprocessingPreset
|
|
371
|
-
removeNavigation?: boolean
|
|
372
|
-
removeForms?: boolean
|
|
373
|
-
}
|
|
799
|
+
export declare function preprocessingOptionsDefault(): PreprocessingOptions
|
|
374
800
|
|
|
375
|
-
|
|
801
|
+
/**
|
|
802
|
+
* HTML preprocessing aggressiveness level.
|
|
803
|
+
*
|
|
804
|
+
* Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
|
|
805
|
+
*/
|
|
806
|
+
export declare const enum PreprocessingPreset {
|
|
807
|
+
/** Minimal cleanup. Remove only essential noise (scripts, styles). */
|
|
376
808
|
Minimal = 'Minimal',
|
|
809
|
+
/** Standard cleanup. Default. Removes navigation, forms, and other auxiliary content. */
|
|
377
810
|
Standard = 'Standard',
|
|
811
|
+
/** Aggressive cleanup. Remove extensive non-content elements and structure. */
|
|
378
812
|
Aggressive = 'Aggressive'
|
|
379
813
|
}
|
|
380
814
|
|
|
381
|
-
|
|
815
|
+
/**
|
|
816
|
+
* A non-fatal diagnostic produced during HTML conversion.
|
|
817
|
+
*
|
|
818
|
+
* Warnings indicate that conversion completed but some content may have been handled
|
|
819
|
+
* differently than expected — for example, an image that could not be extracted, a truncated
|
|
820
|
+
* input, or malformed HTML that was repaired with best-effort parsing.
|
|
821
|
+
*
|
|
822
|
+
* Conversion always succeeds (returns `ConversionResult`) even when warnings are
|
|
823
|
+
* present. Callers should inspect `warnings` and decide how to
|
|
824
|
+
* handle them based on their tolerance for partial results:
|
|
825
|
+
*
|
|
826
|
+
* - **Logging pipelines**: emit each warning at `WARN` level and continue.
|
|
827
|
+
* - **Strict pipelines**: treat any warning as a hard error by checking
|
|
828
|
+
* `result.warnings.length === 0` before using the output.
|
|
829
|
+
*
|
|
830
|
+
* See `WarningKind` for the full taxonomy of warning categories.
|
|
831
|
+
*/
|
|
832
|
+
export interface ProcessingWarning {
|
|
833
|
+
/** Human-readable warning message. */
|
|
382
834
|
message: string
|
|
835
|
+
/** The category of warning. */
|
|
383
836
|
kind: JsWarningKind
|
|
384
837
|
}
|
|
385
838
|
|
|
386
|
-
|
|
839
|
+
/**
|
|
840
|
+
* Structured data block (JSON-LD, Microdata, or RDFa).
|
|
841
|
+
*
|
|
842
|
+
* Represents machine-readable structured data found in the document.
|
|
843
|
+
* JSON-LD blocks are collected as raw JSON strings for flexibility.
|
|
844
|
+
*
|
|
845
|
+
* # Examples
|
|
846
|
+
*/
|
|
847
|
+
export interface StructuredData {
|
|
848
|
+
/** Type of structured data (JSON-LD, Microdata, RDFa) */
|
|
387
849
|
dataType: JsStructuredDataType
|
|
850
|
+
/** Raw JSON string (for JSON-LD) or serialized representation */
|
|
388
851
|
rawJson: string
|
|
852
|
+
/** Schema type if detectable (e.g., "Article", "Event", "Product") */
|
|
389
853
|
schemaType?: string
|
|
390
854
|
}
|
|
391
855
|
|
|
392
|
-
|
|
856
|
+
/**
|
|
857
|
+
* Structured data format type.
|
|
858
|
+
*
|
|
859
|
+
* Identifies the schema/format used for structured data markup.
|
|
860
|
+
*/
|
|
861
|
+
export declare const enum StructuredDataType {
|
|
862
|
+
/** JSON-LD (JSON for Linking Data) script blocks */
|
|
393
863
|
JsonLd = 'json_ld',
|
|
864
|
+
/** HTML5 Microdata attributes (itemscope, itemtype, itemprop) */
|
|
394
865
|
Microdata = 'microdata',
|
|
395
|
-
RDFa
|
|
866
|
+
/** RDF in Attributes (RDFa) markup */
|
|
867
|
+
RDFa = 'rdfa'
|
|
396
868
|
}
|
|
397
869
|
|
|
398
|
-
|
|
399
|
-
|
|
870
|
+
/** A top-level extracted table with both structured data and markdown representation. */
|
|
871
|
+
export interface TableData {
|
|
872
|
+
/** The structured table grid. */
|
|
873
|
+
grid: TableGrid
|
|
874
|
+
/** The markdown rendering of this table. */
|
|
400
875
|
markdown: string
|
|
401
876
|
}
|
|
402
877
|
|
|
403
|
-
|
|
878
|
+
/** A structured table grid with cell-level data including spans. */
|
|
879
|
+
export interface TableGrid {
|
|
880
|
+
/** Number of rows. */
|
|
404
881
|
rows?: number
|
|
882
|
+
/** Number of columns. */
|
|
405
883
|
cols?: number
|
|
884
|
+
/**
|
|
885
|
+
* All cells in the table as a flat, sparse list.
|
|
886
|
+
*
|
|
887
|
+
* The list is ordered by `(row, col)` but is **not** a dense `rows × cols` matrix: cells
|
|
888
|
+
* that are covered by a spanning cell (via `row_span > 1` or `col_span > 1`) do not appear
|
|
889
|
+
* in the list. Only the top-left "origin" cell of a span is present, with its `row_span`
|
|
890
|
+
* and `col_span` fields set accordingly.
|
|
891
|
+
*
|
|
892
|
+
* To reconstruct the full visual grid, iterate over all cells and mark the rectangular
|
|
893
|
+
* region `[row .. row+row_span, col .. col+col_span]` as occupied by that cell. Any
|
|
894
|
+
* `(row, col)` position that is not the origin of any cell is covered by a span from an
|
|
895
|
+
* earlier cell.
|
|
896
|
+
*
|
|
897
|
+
* The length of this array is `≤ rows * cols`. An empty table (`rows == 0 || cols == 0`)
|
|
898
|
+
* produces an empty array.
|
|
899
|
+
*/
|
|
406
900
|
cells?: Array<JsGridCell>
|
|
407
901
|
}
|
|
408
902
|
|
|
409
|
-
|
|
903
|
+
/**
|
|
904
|
+
* A styling or semantic annotation that applies to a byte range within a node's text.
|
|
905
|
+
*
|
|
906
|
+
* Unlike `DocumentNode`, which captures block-level structure (headings, paragraphs, etc.),
|
|
907
|
+
* a `TextAnnotation` describes inline-level markup — bold, italic, links, code spans, and
|
|
908
|
+
* similar — that spans a contiguous run of bytes inside `DocumentNode.content`'s text field.
|
|
909
|
+
*
|
|
910
|
+
* Byte offsets (`start`..`end`) are into the UTF-8 encoded text of the parent node. The range
|
|
911
|
+
* follows Rust slice conventions: `start` is inclusive and `end` is exclusive, so the annotated
|
|
912
|
+
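* text is `text[start as usize..end as usize]` in Rust. In JavaScript, slice the UTF-8 bytes rather than the string: `Buffer.from(text, 'utf8').subarray(start, end).toString('utf8')`.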
|
|
913
|
+
*
|
|
914
|
+
* Multiple annotations on the same node can overlap (e.g. bold-italic text), and they are
|
|
915
|
+
* stored in the order they are encountered during DOM traversal.
|
|
916
|
+
*
|
|
917
|
+
* See `AnnotationKind` for the full list of supported annotation types.
|
|
918
|
+
*/
|
|
919
|
+
export interface TextAnnotation {
|
|
920
|
+
/** Start byte offset (inclusive) into the parent node's text. */
|
|
410
921
|
start: number
|
|
922
|
+
/** End byte offset (exclusive) into the parent node's text. */
|
|
411
923
|
end: number
|
|
924
|
+
/** The type of annotation. */
|
|
412
925
|
kind: JsAnnotationKind
|
|
413
926
|
}
|
|
414
927
|
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
928
|
+
/**
|
|
929
|
+
* Text directionality of document content.
|
|
930
|
+
*
|
|
931
|
+
* Corresponds to the HTML `dir` attribute and `bdi` element directionality.
|
|
932
|
+
*/
|
|
933
|
+
export declare const enum TextDirection {
|
|
934
|
+
/** Left-to-right text flow (default for Latin scripts) */
|
|
935
|
+
LeftToRight = 'ltr',
|
|
936
|
+
/** Right-to-left text flow (Hebrew, Arabic, Urdu, etc.) */
|
|
937
|
+
RightToLeft = 'rtl',
|
|
938
|
+
/** Automatic directionality detection */
|
|
939
|
+
Auto = 'auto'
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
/**
|
|
943
|
+
* Result of a visitor callback.
|
|
944
|
+
*
|
|
945
|
+
* Allows visitors to control the conversion flow by either proceeding
|
|
946
|
+
* with default behavior, providing custom output, skipping elements,
|
|
947
|
+
* preserving HTML, or signaling errors.
|
|
948
|
+
*/
|
|
949
|
+
export declare const enum VisitResult {
|
|
950
|
+
/** Continue with default conversion behavior */
|
|
422
951
|
Continue = 'Continue',
|
|
952
|
+
/**
|
|
953
|
+
* Replace default output with custom markdown
|
|
954
|
+
*
|
|
955
|
+
* The visitor takes full responsibility for the markdown output
|
|
956
|
+
* of this node and its children.
|
|
957
|
+
*/
|
|
423
958
|
Custom = 'Custom',
|
|
959
|
+
/**
|
|
960
|
+
* Skip this element entirely (don't output anything)
|
|
961
|
+
*
|
|
962
|
+
* The element and all its children are ignored in the output.
|
|
963
|
+
*/
|
|
424
964
|
Skip = 'Skip',
|
|
965
|
+
/**
|
|
966
|
+
* Preserve original HTML (don't convert to markdown)
|
|
967
|
+
*
|
|
968
|
+
* The element's raw HTML is included verbatim in the output.
|
|
969
|
+
*/
|
|
425
970
|
PreserveHtml = 'PreserveHtml',
|
|
971
|
+
/**
|
|
972
|
+
* Stop conversion with an error
|
|
973
|
+
*
|
|
974
|
+
* The conversion process halts and returns this error message.
|
|
975
|
+
*/
|
|
426
976
|
Error = 'Error'
|
|
427
977
|
}
|
|
428
978
|
|
|
429
|
-
|
|
979
|
+
/** Categories of processing warnings. */
|
|
980
|
+
export declare const enum WarningKind {
|
|
981
|
+
/** An image could not be extracted (e.g. invalid data URI, unsupported format). */
|
|
430
982
|
ImageExtractionFailed = 'image_extraction_failed',
|
|
983
|
+
/** The input encoding was not recognized; fell back to UTF-8. */
|
|
431
984
|
EncodingFallback = 'encoding_fallback',
|
|
985
|
+
/** The input was truncated due to size limits. */
|
|
432
986
|
TruncatedInput = 'truncated_input',
|
|
987
|
+
/** The HTML was malformed but processing continued with best effort. */
|
|
433
988
|
MalformedHtml = 'malformed_html',
|
|
989
|
+
/** Sanitization was applied to remove potentially unsafe content. */
|
|
434
990
|
SanitizationApplied = 'sanitization_applied',
|
|
991
|
+
/** DOM traversal was truncated because max_depth was exceeded. */
|
|
435
992
|
DepthLimitExceeded = 'depth_limit_exceeded'
|
|
436
993
|
}
|
|
437
994
|
|
|
438
|
-
|
|
995
|
+
/**
|
|
996
|
+
* Whitespace handling strategy during conversion.
|
|
997
|
+
*
|
|
998
|
+
* Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
|
|
999
|
+
*/
|
|
1000
|
+
export declare const enum WhitespaceMode {
|
|
1001
|
+
/** Collapse multiple whitespace characters to single spaces. Default. Matches browser behavior. */
|
|
439
1002
|
Normalized = 'Normalized',
|
|
1003
|
+
/** Preserve all whitespace exactly as it appears in the HTML. */
|
|
440
1004
|
Strict = 'Strict'
|
|
441
1005
|
}
|