html-to-markdown 3.4.1-aarch64-linux → 3.5.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +10 -2
- data/ext/html_to_markdown_rb/Cargo.toml +1 -1
- data/ext/html_to_markdown_rb/extconf.rb +5 -5
- data/ext/html_to_markdown_rb/native/Cargo.lock +10 -10
- data/ext/html_to_markdown_rb/native/Cargo.toml +2 -11
- data/ext/html_to_markdown_rb/src/lib.rs +184 -122
- data/lib/html_to_markdown/native.rb +1204 -36
- data/lib/html_to_markdown/version.rb +2 -2
- data/lib/html_to_markdown.rb +3 -3
- data/lib/html_to_markdown_rb.so +0 -0
- data/sig/types.rbs +12 -273
- metadata +16 -2
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:13800159bcd376ddaa37a79cd1dbda8cbefca3a8450e10e9a6510a9576cad1f4
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
6
6
|
# frozen_string_literal: true
|
|
7
7
|
|
|
8
8
|
module HtmlToMarkdown
|
|
9
|
-
VERSION =
|
|
9
|
+
VERSION = "3.5.0"
|
|
10
10
|
end
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:47eeb6be7adffaa90e4383392c292f7b374ce905d84da6b6997e9798053a05a0
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
6
6
|
# frozen_string_literal: true
|
|
7
7
|
|
|
8
|
-
require_relative
|
|
9
|
-
require_relative
|
|
8
|
+
require_relative "html_to_markdown/version"
|
|
9
|
+
require_relative "html_to_markdown/native"
|
|
10
10
|
|
|
11
11
|
module HtmlToMarkdown
|
|
12
12
|
# Re-export all types and functions from native extension
|
data/lib/html_to_markdown_rb.so
CHANGED
|
Binary file
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:86b0e721bec1828e9c4c971cf6a562e12a69f76d8e0f89e96dc5c1b80f4dd75e
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
@@ -8,25 +8,9 @@ module HtmlToMarkdown
|
|
|
8
8
|
|
|
9
9
|
VERSION: String
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
# Document-level metadata extracted from `<head>` and top-level elements.
|
|
13
|
-
#
|
|
14
|
-
# Contains all metadata typically used by search engines, social media platforms,
|
|
15
|
-
# and browsers for document indexing and presentation.
|
|
16
|
-
#
|
|
17
|
-
# # Examples
|
|
18
|
-
#
|
|
19
|
-
# ```
|
|
20
|
-
# let doc = DocumentMetadata {
|
|
21
|
-
# title: Some("My Article".to_string()),
|
|
22
|
-
# description: Some("A great article about Rust".to_string()),
|
|
23
|
-
# keywords: vec!["rust".to_string(), "programming".to_string()],
|
|
24
|
-
# ..Default::default()
|
|
25
|
-
# };
|
|
26
|
-
#
|
|
27
|
-
# assert_eq!(doc.title, Some("My Article".to_string()));
|
|
28
|
-
# ```
|
|
11
|
+
type json_value = Hash[String, untyped] | Array[untyped] | String | Integer | Float | bool | nil
|
|
29
12
|
|
|
13
|
+
class DocumentMetadata
|
|
30
14
|
attr_accessor title: String?
|
|
31
15
|
attr_accessor description: String?
|
|
32
16
|
attr_accessor keywords: Array[String]?
|
|
@@ -39,30 +23,10 @@ module HtmlToMarkdown
|
|
|
39
23
|
attr_accessor twitter_card: Hash[String, String]?
|
|
40
24
|
attr_accessor meta_tags: Hash[String, String]?
|
|
41
25
|
|
|
42
|
-
def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
|
|
26
|
+
def initialize: (?title: String, ?description: String, ?keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, ?open_graph: Hash[String, String], ?twitter_card: Hash[String, String], ?meta_tags: Hash[String, String]) -> void
|
|
43
27
|
end
|
|
44
28
|
|
|
45
29
|
class HeaderMetadata
|
|
46
|
-
# Header element metadata with hierarchy tracking.
|
|
47
|
-
#
|
|
48
|
-
# Captures heading elements (h1-h6) with their text content, identifiers,
|
|
49
|
-
# and position in the document structure.
|
|
50
|
-
#
|
|
51
|
-
# # Examples
|
|
52
|
-
#
|
|
53
|
-
# ```
|
|
54
|
-
# let header = HeaderMetadata {
|
|
55
|
-
# level: 1,
|
|
56
|
-
# text: "Main Title".to_string(),
|
|
57
|
-
# id: Some("main-title".to_string()),
|
|
58
|
-
# depth: 0,
|
|
59
|
-
# html_offset: 145,
|
|
60
|
-
# };
|
|
61
|
-
#
|
|
62
|
-
# assert_eq!(header.level, 1);
|
|
63
|
-
# assert!(header.is_valid());
|
|
64
|
-
# ```
|
|
65
|
-
|
|
66
30
|
attr_reader level: Integer
|
|
67
31
|
attr_reader text: String
|
|
68
32
|
attr_reader id: String
|
|
@@ -74,26 +38,6 @@ module HtmlToMarkdown
|
|
|
74
38
|
end
|
|
75
39
|
|
|
76
40
|
class LinkMetadata
|
|
77
|
-
# Hyperlink metadata with categorization and attributes.
|
|
78
|
-
#
|
|
79
|
-
# Represents `<a>` elements with parsed href values, text content, and link type classification.
|
|
80
|
-
#
|
|
81
|
-
# # Examples
|
|
82
|
-
#
|
|
83
|
-
# ```
|
|
84
|
-
# let link = LinkMetadata {
|
|
85
|
-
# href: "https://example.com".to_string(),
|
|
86
|
-
# text: "Example".to_string(),
|
|
87
|
-
# title: Some("Visit Example".to_string()),
|
|
88
|
-
# link_type: LinkType::External,
|
|
89
|
-
# rel: vec!["nofollow".to_string()],
|
|
90
|
-
# attributes: Default::default(),
|
|
91
|
-
# };
|
|
92
|
-
#
|
|
93
|
-
# assert_eq!(link.link_type, LinkType::External);
|
|
94
|
-
# assert_eq!(link.text, "Example");
|
|
95
|
-
# ```
|
|
96
|
-
|
|
97
41
|
attr_reader href: String
|
|
98
42
|
attr_reader text: String
|
|
99
43
|
attr_reader title: String
|
|
@@ -102,30 +46,9 @@ module HtmlToMarkdown
|
|
|
102
46
|
attr_reader attributes: Hash[String, String]
|
|
103
47
|
|
|
104
48
|
def initialize: (href: String, text: String, ?title: String, link_type: LinkType, rel: Array[String], attributes: Hash[String, String]) -> void
|
|
105
|
-
def self.classify_link: (String href) -> LinkType
|
|
106
49
|
end
|
|
107
50
|
|
|
108
51
|
class ImageMetadata
|
|
109
|
-
# Image metadata with source and dimensions.
|
|
110
|
-
#
|
|
111
|
-
# Captures `<img>` elements and inline `<svg>` elements with metadata
|
|
112
|
-
# for image analysis and optimization.
|
|
113
|
-
#
|
|
114
|
-
# # Examples
|
|
115
|
-
#
|
|
116
|
-
# ```
|
|
117
|
-
# let img = ImageMetadata {
|
|
118
|
-
# src: "https://example.com/image.jpg".to_string(),
|
|
119
|
-
# alt: Some("An example image".to_string()),
|
|
120
|
-
# title: Some("Example".to_string()),
|
|
121
|
-
# dimensions: Some((800, 600)),
|
|
122
|
-
# image_type: ImageType::External,
|
|
123
|
-
# attributes: Default::default(),
|
|
124
|
-
# };
|
|
125
|
-
#
|
|
126
|
-
# assert_eq!(img.image_type, ImageType::External);
|
|
127
|
-
# ```
|
|
128
|
-
|
|
129
52
|
attr_reader src: String
|
|
130
53
|
attr_reader alt: String
|
|
131
54
|
attr_reader title: String
|
|
@@ -137,23 +60,6 @@ module HtmlToMarkdown
|
|
|
137
60
|
end
|
|
138
61
|
|
|
139
62
|
class StructuredData
|
|
140
|
-
# Structured data block (JSON-LD, Microdata, or RDFa).
|
|
141
|
-
#
|
|
142
|
-
# Represents machine-readable structured data found in the document.
|
|
143
|
-
# JSON-LD blocks are collected as raw JSON strings for flexibility.
|
|
144
|
-
#
|
|
145
|
-
# # Examples
|
|
146
|
-
#
|
|
147
|
-
# ```
|
|
148
|
-
# let schema = StructuredData {
|
|
149
|
-
# data_type: StructuredDataType::JsonLd,
|
|
150
|
-
# raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
|
|
151
|
-
# schema_type: Some("Article".to_string()),
|
|
152
|
-
# };
|
|
153
|
-
#
|
|
154
|
-
# assert_eq!(schema.data_type, StructuredDataType::JsonLd);
|
|
155
|
-
# ```
|
|
156
|
-
|
|
157
63
|
attr_reader data_type: StructuredDataType
|
|
158
64
|
attr_reader raw_json: String
|
|
159
65
|
attr_reader schema_type: String
|
|
@@ -162,51 +68,16 @@ module HtmlToMarkdown
|
|
|
162
68
|
end
|
|
163
69
|
|
|
164
70
|
class HtmlMetadata
|
|
165
|
-
# Comprehensive metadata extraction result from HTML document.
|
|
166
|
-
#
|
|
167
|
-
# Contains all extracted metadata types in a single structure,
|
|
168
|
-
# suitable for serialization and transmission across language boundaries.
|
|
169
|
-
#
|
|
170
|
-
# # Examples
|
|
171
|
-
#
|
|
172
|
-
# ```
|
|
173
|
-
# let metadata = HtmlMetadata {
|
|
174
|
-
# document: Default::default(),
|
|
175
|
-
# headers: Vec::new(),
|
|
176
|
-
# links: Vec::new(),
|
|
177
|
-
# images: Vec::new(),
|
|
178
|
-
# structured_data: Vec::new(),
|
|
179
|
-
# };
|
|
180
|
-
#
|
|
181
|
-
# assert!(metadata.headers.is_empty());
|
|
182
|
-
# ```
|
|
183
|
-
|
|
184
71
|
attr_accessor document: DocumentMetadata?
|
|
185
72
|
attr_accessor headers: Array[HeaderMetadata]?
|
|
186
73
|
attr_accessor links: Array[LinkMetadata]?
|
|
187
74
|
attr_accessor images: Array[ImageMetadata]?
|
|
188
75
|
attr_accessor structured_data: Array[StructuredData]?
|
|
189
76
|
|
|
190
|
-
def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
|
|
77
|
+
def initialize: (?document: DocumentMetadata, ?headers: Array[HeaderMetadata], ?links: Array[LinkMetadata], ?images: Array[ImageMetadata], ?structured_data: Array[StructuredData]) -> void
|
|
191
78
|
end
|
|
192
79
|
|
|
193
80
|
class ConversionOptions
|
|
194
|
-
# Main conversion options for HTML to Markdown conversion.
|
|
195
|
-
#
|
|
196
|
-
# Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
|
|
197
|
-
#
|
|
198
|
-
# # Example
|
|
199
|
-
#
|
|
200
|
-
# ```text
|
|
201
|
-
# use html_to_markdown_rs::ConversionOptions;
|
|
202
|
-
#
|
|
203
|
-
# let options = ConversionOptions::builder()
|
|
204
|
-
# .heading_style(HeadingStyle::Atx)
|
|
205
|
-
# .wrap(true)
|
|
206
|
-
# .wrap_width(100)
|
|
207
|
-
# .build();
|
|
208
|
-
# ```
|
|
209
|
-
|
|
210
81
|
attr_accessor heading_style: HeadingStyle?
|
|
211
82
|
attr_accessor list_indent_type: ListIndentType?
|
|
212
83
|
attr_accessor list_indent_width: Integer?
|
|
@@ -220,6 +91,7 @@ module HtmlToMarkdown
|
|
|
220
91
|
attr_accessor autolinks: bool?
|
|
221
92
|
attr_accessor default_title: bool?
|
|
222
93
|
attr_accessor br_in_tables: bool?
|
|
94
|
+
attr_accessor compact_tables: bool?
|
|
223
95
|
attr_accessor highlight_style: HighlightStyle?
|
|
224
96
|
attr_accessor extract_metadata: bool?
|
|
225
97
|
attr_accessor whitespace_mode: WhitespaceMode?
|
|
@@ -249,34 +121,11 @@ module HtmlToMarkdown
|
|
|
249
121
|
attr_accessor exclude_selectors: Array[String]?
|
|
250
122
|
attr_accessor visitor: VisitorHandle?
|
|
251
123
|
|
|
252
|
-
def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
|
|
253
|
-
def apply_update: (ConversionOptionsUpdate update) -> void
|
|
124
|
+
def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?compact_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptions, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
|
|
254
125
|
def self.default: () -> ConversionOptions
|
|
255
|
-
def self.builder: () -> ConversionOptionsBuilder
|
|
256
|
-
def self.from_update: (ConversionOptionsUpdate update) -> ConversionOptions
|
|
257
|
-
def self.from: (ConversionOptionsUpdate update) -> ConversionOptions
|
|
258
|
-
end
|
|
259
|
-
|
|
260
|
-
class ConversionOptionsBuilder
|
|
261
|
-
# Builder for [`ConversionOptions`].
|
|
262
|
-
#
|
|
263
|
-
# All fields start with default values. Call `.build()` to produce the final options.
|
|
264
|
-
|
|
265
|
-
def strip_tags: (Array[String] tags) -> ConversionOptionsBuilder
|
|
266
|
-
def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
|
|
267
|
-
def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
|
|
268
|
-
def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
|
|
269
|
-
def visitor: (?VisitorHandle visitor) -> ConversionOptionsBuilder
|
|
270
|
-
def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
|
|
271
|
-
def build: () -> ConversionOptions
|
|
272
126
|
end
|
|
273
127
|
|
|
274
128
|
class ConversionOptionsUpdate
|
|
275
|
-
# Partial update for `ConversionOptions`.
|
|
276
|
-
#
|
|
277
|
-
# Uses `Option<T>` fields for selective updates. Bindings use this to construct
|
|
278
|
-
# options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
|
|
279
|
-
|
|
280
129
|
attr_accessor heading_style: HeadingStyle?
|
|
281
130
|
attr_accessor list_indent_type: ListIndentType?
|
|
282
131
|
attr_accessor list_indent_width: Integer?
|
|
@@ -290,6 +139,7 @@ module HtmlToMarkdown
|
|
|
290
139
|
attr_accessor autolinks: bool?
|
|
291
140
|
attr_accessor default_title: bool?
|
|
292
141
|
attr_accessor br_in_tables: bool?
|
|
142
|
+
attr_accessor compact_tables: bool?
|
|
293
143
|
attr_accessor highlight_style: HighlightStyle?
|
|
294
144
|
attr_accessor extract_metadata: bool?
|
|
295
145
|
attr_accessor whitespace_mode: WhitespaceMode?
|
|
@@ -319,31 +169,20 @@ module HtmlToMarkdown
|
|
|
319
169
|
attr_accessor exclude_selectors: Array[String]?
|
|
320
170
|
attr_accessor visitor: VisitorHandle?
|
|
321
171
|
|
|
322
|
-
def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
|
|
172
|
+
def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?compact_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
|
|
323
173
|
end
|
|
324
174
|
|
|
325
175
|
class PreprocessingOptions
|
|
326
|
-
# HTML preprocessing options for document cleanup before conversion.
|
|
327
|
-
|
|
328
176
|
attr_accessor enabled: bool?
|
|
329
177
|
attr_accessor preset: PreprocessingPreset?
|
|
330
178
|
attr_accessor remove_navigation: bool?
|
|
331
179
|
attr_accessor remove_forms: bool?
|
|
332
180
|
|
|
333
|
-
def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
|
|
334
|
-
def apply_update: (PreprocessingOptionsUpdate update) -> void
|
|
181
|
+
def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
|
|
335
182
|
def self.default: () -> PreprocessingOptions
|
|
336
|
-
def self.from_update: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
|
|
337
|
-
def self.from: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
|
|
338
183
|
end
|
|
339
184
|
|
|
340
185
|
class PreprocessingOptionsUpdate
|
|
341
|
-
# Partial update for `PreprocessingOptions`.
|
|
342
|
-
#
|
|
343
|
-
# This struct uses `Option<T>` to represent optional fields that can be selectively updated.
|
|
344
|
-
# Only specified fields (Some values) will override existing options; None values leave the
|
|
345
|
-
# corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
|
|
346
|
-
|
|
347
186
|
attr_accessor enabled: bool?
|
|
348
187
|
attr_accessor preset: PreprocessingPreset?
|
|
349
188
|
attr_accessor remove_navigation: bool?
|
|
@@ -353,10 +192,6 @@ module HtmlToMarkdown
|
|
|
353
192
|
end
|
|
354
193
|
|
|
355
194
|
class DocumentStructure
|
|
356
|
-
# A structured document tree representing the semantic content of an HTML document.
|
|
357
|
-
#
|
|
358
|
-
# Uses a flat node array with index-based parent/child references for efficient traversal.
|
|
359
|
-
|
|
360
195
|
attr_reader nodes: Array[DocumentNode]
|
|
361
196
|
attr_reader source_format: String
|
|
362
197
|
|
|
@@ -364,8 +199,6 @@ module HtmlToMarkdown
|
|
|
364
199
|
end
|
|
365
200
|
|
|
366
201
|
class DocumentNode
|
|
367
|
-
# A single node in the document tree.
|
|
368
|
-
|
|
369
202
|
attr_reader id: String
|
|
370
203
|
attr_reader content: NodeContent
|
|
371
204
|
attr_reader parent: Integer
|
|
@@ -377,10 +210,6 @@ module HtmlToMarkdown
|
|
|
377
210
|
end
|
|
378
211
|
|
|
379
212
|
class TextAnnotation
|
|
380
|
-
# An inline text annotation with byte-range offsets.
|
|
381
|
-
#
|
|
382
|
-
# Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
|
|
383
|
-
|
|
384
213
|
attr_reader start: Integer
|
|
385
214
|
attr_reader end: Integer
|
|
386
215
|
attr_reader kind: AnnotationKind
|
|
@@ -389,21 +218,6 @@ module HtmlToMarkdown
|
|
|
389
218
|
end
|
|
390
219
|
|
|
391
220
|
class ConversionResult
|
|
392
|
-
# The primary result of HTML conversion and extraction.
|
|
393
|
-
#
|
|
394
|
-
# Contains the converted text output, optional structured document tree,
|
|
395
|
-
# metadata, extracted tables, images, and processing warnings.
|
|
396
|
-
#
|
|
397
|
-
# # Example
|
|
398
|
-
#
|
|
399
|
-
# ```text
|
|
400
|
-
# use html_to_markdown_rs::{convert, ConversionOptions};
|
|
401
|
-
#
|
|
402
|
-
# let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
403
|
-
# assert!(result.content.is_some());
|
|
404
|
-
# assert!(result.warnings.is_empty());
|
|
405
|
-
# ```
|
|
406
|
-
|
|
407
221
|
attr_accessor content: String?
|
|
408
222
|
attr_accessor document: DocumentStructure?
|
|
409
223
|
attr_accessor metadata: HtmlMetadata?
|
|
@@ -411,22 +225,18 @@ module HtmlToMarkdown
|
|
|
411
225
|
attr_accessor images: Array[String]?
|
|
412
226
|
attr_accessor warnings: Array[ProcessingWarning]?
|
|
413
227
|
|
|
414
|
-
def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
|
|
228
|
+
def initialize: (?content: String, ?document: DocumentStructure, ?metadata: HtmlMetadata, ?tables: Array[TableData], ?images: Array[String], ?warnings: Array[ProcessingWarning]) -> void
|
|
415
229
|
end
|
|
416
230
|
|
|
417
231
|
class TableGrid
|
|
418
|
-
# A structured table grid with cell-level data including spans.
|
|
419
|
-
|
|
420
232
|
attr_accessor rows: Integer?
|
|
421
233
|
attr_accessor cols: Integer?
|
|
422
234
|
attr_accessor cells: Array[GridCell]?
|
|
423
235
|
|
|
424
|
-
def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
|
|
236
|
+
def initialize: (?rows: Integer, ?cols: Integer, ?cells: Array[GridCell]) -> void
|
|
425
237
|
end
|
|
426
238
|
|
|
427
239
|
class GridCell
|
|
428
|
-
# A single cell in a table grid.
|
|
429
|
-
|
|
430
240
|
attr_reader content: String
|
|
431
241
|
attr_reader row: Integer
|
|
432
242
|
attr_reader col: Integer
|
|
@@ -438,8 +248,6 @@ module HtmlToMarkdown
|
|
|
438
248
|
end
|
|
439
249
|
|
|
440
250
|
class TableData
|
|
441
|
-
# A top-level extracted table with both structured data and markdown representation.
|
|
442
|
-
|
|
443
251
|
attr_reader grid: TableGrid
|
|
444
252
|
attr_reader markdown: String
|
|
445
253
|
|
|
@@ -447,8 +255,6 @@ module HtmlToMarkdown
|
|
|
447
255
|
end
|
|
448
256
|
|
|
449
257
|
class ProcessingWarning
|
|
450
|
-
# A non-fatal warning generated during HTML processing.
|
|
451
|
-
|
|
452
258
|
attr_reader message: String
|
|
453
259
|
attr_reader kind: WarningKind
|
|
454
260
|
|
|
@@ -456,20 +262,9 @@ module HtmlToMarkdown
|
|
|
456
262
|
end
|
|
457
263
|
|
|
458
264
|
class VisitorHandle
|
|
459
|
-
# Type alias for a visitor handle (`Arc`-wrapped `Mutex` for thread-safe shared mutation).
|
|
460
|
-
#
|
|
461
|
-
# `Send + Sync` so that types embedding a `VisitorHandle` (e.g. `ConversionOptions`)
|
|
462
|
-
# can be shared across threads — required by callers that stash configs inside
|
|
463
|
-
# axum/rmcp/tokio Send-bound contexts.
|
|
464
|
-
|
|
465
265
|
end
|
|
466
266
|
|
|
467
267
|
class NodeContext
|
|
468
|
-
# Context information passed to all visitor methods.
|
|
469
|
-
#
|
|
470
|
-
# Provides comprehensive metadata about the current node being visited,
|
|
471
|
-
# including its type, attributes, position in the DOM tree, and parent context.
|
|
472
|
-
|
|
473
268
|
attr_reader node_type: NodeType
|
|
474
269
|
attr_reader tag_name: String
|
|
475
270
|
attr_reader attributes: Hash[String, String]
|
|
@@ -482,128 +277,72 @@ module HtmlToMarkdown
|
|
|
482
277
|
end
|
|
483
278
|
|
|
484
279
|
class TextDirection
|
|
485
|
-
# Text directionality of document content.
|
|
486
|
-
#
|
|
487
|
-
# Corresponds to the HTML `dir` attribute and `bdi` element directionality.
|
|
488
280
|
type value = :left_to_right | :right_to_left | :auto
|
|
489
281
|
end
|
|
490
282
|
|
|
491
283
|
class LinkType
|
|
492
|
-
# Link classification based on href value and document context.
|
|
493
|
-
#
|
|
494
|
-
# Used to categorize links during extraction for filtering and analysis.
|
|
495
284
|
type value = :anchor | :internal | :external | :email | :phone | :other
|
|
496
285
|
end
|
|
497
286
|
|
|
498
287
|
class ImageType
|
|
499
|
-
# Image source classification for proper handling and processing.
|
|
500
|
-
#
|
|
501
|
-
# Determines whether an image is embedded (data URI), inline SVG, external, or relative.
|
|
502
288
|
type value = :data_uri | :inline_svg | :external | :relative
|
|
503
289
|
end
|
|
504
290
|
|
|
505
291
|
class StructuredDataType
|
|
506
|
-
# Structured data format type.
|
|
507
|
-
#
|
|
508
|
-
# Identifies the schema/format used for structured data markup.
|
|
509
292
|
type value = :json_ld | :microdata | :r_d_fa
|
|
510
293
|
end
|
|
511
294
|
|
|
512
295
|
class PreprocessingPreset
|
|
513
|
-
# HTML preprocessing aggressiveness level.
|
|
514
|
-
#
|
|
515
|
-
# Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
|
|
516
296
|
type value = :minimal | :standard | :aggressive
|
|
517
297
|
end
|
|
518
298
|
|
|
519
299
|
class HeadingStyle
|
|
520
|
-
# Heading style options for Markdown output.
|
|
521
|
-
#
|
|
522
|
-
# Controls how headings (h1-h6) are rendered in the output Markdown.
|
|
523
300
|
type value = :underlined | :atx | :atx_closed
|
|
524
301
|
end
|
|
525
302
|
|
|
526
303
|
class ListIndentType
|
|
527
|
-
# List indentation character type.
|
|
528
|
-
#
|
|
529
|
-
# Controls whether list items are indented with spaces or tabs.
|
|
530
304
|
type value = :spaces | :tabs
|
|
531
305
|
end
|
|
532
306
|
|
|
533
307
|
class WhitespaceMode
|
|
534
|
-
# Whitespace handling strategy during conversion.
|
|
535
|
-
#
|
|
536
|
-
# Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
|
|
537
308
|
type value = :normalized | :strict
|
|
538
309
|
end
|
|
539
310
|
|
|
540
311
|
class NewlineStyle
|
|
541
|
-
# Line break syntax in Markdown output.
|
|
542
|
-
#
|
|
543
|
-
# Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
|
|
544
312
|
type value = :spaces | :backslash
|
|
545
313
|
end
|
|
546
314
|
|
|
547
315
|
class CodeBlockStyle
|
|
548
|
-
# Code block fence style in Markdown output.
|
|
549
|
-
#
|
|
550
|
-
# Determines how code blocks (`<pre><code>`) are rendered in Markdown.
|
|
551
316
|
type value = :indented | :backticks | :tildes
|
|
552
317
|
end
|
|
553
318
|
|
|
554
319
|
class HighlightStyle
|
|
555
|
-
# Highlight rendering style for `<mark>` elements.
|
|
556
|
-
#
|
|
557
|
-
# Controls how highlighted text is rendered in Markdown output.
|
|
558
320
|
type value = :double_equal | :html | :bold | :none
|
|
559
321
|
end
|
|
560
322
|
|
|
561
323
|
class LinkStyle
|
|
562
|
-
# Link rendering style in Markdown output.
|
|
563
|
-
#
|
|
564
|
-
# Controls whether links and images use inline `[text](url)` syntax or
|
|
565
|
-
# reference-style `[text][1]` syntax with definitions collected at the end.
|
|
566
324
|
type value = :inline | :reference
|
|
567
325
|
end
|
|
568
326
|
|
|
569
327
|
class OutputFormat
|
|
570
|
-
# Output format for conversion.
|
|
571
|
-
#
|
|
572
|
-
# Specifies the target markup language format for the conversion output.
|
|
573
328
|
type value = :markdown | :djot | :plain
|
|
574
329
|
end
|
|
575
330
|
|
|
576
331
|
class NodeContent
|
|
577
|
-
# The semantic content type of a document node.
|
|
578
|
-
#
|
|
579
|
-
# Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
|
|
580
332
|
end
|
|
581
333
|
|
|
582
334
|
class AnnotationKind
|
|
583
|
-
# The type of an inline text annotation.
|
|
584
|
-
#
|
|
585
|
-
# Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
586
335
|
end
|
|
587
336
|
|
|
588
337
|
class WarningKind
|
|
589
|
-
# Categories of processing warnings.
|
|
590
338
|
type value = :image_extraction_failed | :encoding_fallback | :truncated_input | :malformed_html | :sanitization_applied | :depth_limit_exceeded
|
|
591
339
|
end
|
|
592
340
|
|
|
593
341
|
class NodeType
|
|
594
|
-
# Node type enumeration covering all HTML element types.
|
|
595
|
-
#
|
|
596
|
-
# This enum categorizes all HTML elements that the converter recognizes,
|
|
597
|
-
# providing a coarse-grained classification for visitor dispatch.
|
|
598
342
|
type value = :text | :element | :heading | :paragraph | :div | :blockquote | :pre | :hr | :list | :list_item | :definition_list | :definition_term | :definition_description | :table | :table_row | :table_cell | :table_header | :table_body | :table_head | :table_foot | :link | :image | :strong | :em | :code | :strikethrough | :underline | :subscript | :superscript | :mark | :small | :br | :span | :article | :section | :nav | :aside | :header | :footer | :main | :figure | :figcaption | :time | :details | :summary | :form | :input | :select | :option | :button | :textarea | :label | :fieldset | :legend | :audio | :video | :picture | :source | :iframe | :svg | :canvas | :ruby | :rt | :rp | :abbr | :kbd | :samp | :var | :cite | :q | :del | :ins | :data | :meter | :progress | :output | :template | :slot | :html | :head | :body | :title | :meta | :link_tag | :style | :script | :base | :custom
|
|
599
343
|
end
|
|
600
344
|
|
|
601
345
|
class VisitResult
|
|
602
|
-
# Result of a visitor callback.
|
|
603
|
-
#
|
|
604
|
-
# Allows visitors to control the conversion flow by either proceeding
|
|
605
|
-
# with default behavior, providing custom output, skipping elements,
|
|
606
|
-
# preserving HTML, or signaling errors.
|
|
607
346
|
end
|
|
608
347
|
|
|
609
348
|
def self.convert: (String html, ?ConversionOptions options) -> ConversionResult
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.
|
|
4
|
+
version: 3.5.0
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Kreuzberg Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -24,6 +24,20 @@ dependencies:
|
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '0.9'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: sorbet-runtime
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0.5'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0.5'
|
|
27
41
|
description: High-performance HTML to Markdown converter
|
|
28
42
|
email:
|
|
29
43
|
executables: []
|