html-to-markdown 3.4.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '092b4d1ad3d0ea4ac57bfe807760729d6cd676066dd05fc5f7fd3ecfeb3cb1c7'
4
+ data.tar.gz: d488926fb2483f76d356eb12082bb0f426a21b7d8bde7a981297a14abc7542a7
5
+ SHA512:
6
+ metadata.gz: 1d1cd3a0b1135303d143374ec4bac13cf69d6dc1448bf29c90431ee7de7cce0e228aa568604575c39b2fb1237a32a48f76aad79ec486739910a7bb907a1d640d
7
+ data.tar.gz: 0a58dbc37be917e28cbc14f2d97f6ed51db6191ebddf1229ab49fa78f5dcdf2623c67762dff40a99adc44fb0de339317ea10123312f50469e8a865d061998f31
data/Steepfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
# Steep type-check target for the gem's library code.
target(:lib) do
  # Directory that holds the RBS signatures.
  signature('sig')
  # Directory whose Ruby sources are checked against those signatures.
  check('lib')
end
Binary file
@@ -0,0 +1,59 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:b54e7bb2ab55cc6c25c9cac0e62ec66c35fd2d1956ef9ba5e3dc9e7ba5e666a5
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+ # frozen_string_literal: true
7
+
8
+ require 'json'
9
+ require 'html_to_markdown_rb'
10
# Public namespace of the gem. It mirrors the API exposed by the native
# extension module (HtmlToMarkdownRs) so callers never touch that module directly.
module HtmlToMarkdown
  # Forward each singleton method defined directly on the native module,
  # passing positional arguments, keyword arguments and the block through.
  HtmlToMarkdownRs.methods(false).each do |native_method|
    define_singleton_method(native_method) do |*args, **kwargs, &blk|
      HtmlToMarkdownRs.public_send(native_method, *args, **kwargs, &blk)
    end
  end

  # Alias each native constant (classes, structs, etc.) into this namespace,
  # skipping any name that already resolves from here.
  HtmlToMarkdownRs.constants.each do |native_const|
    next if const_defined?(native_const)

    const_set(native_const, HtmlToMarkdownRs.const_get(native_const))
  end
end
21
+
22
+ # Add accessor methods to Hash-based internally-tagged enum instances (note: this reopens the core Hash class, so it applies to every Hash in the process)
23
class Hash
  # Method names that Ruby itself probes on arbitrary objects to perform
  # implicit conversions. They are never answered from hash keys: doing so
  # would make a hash that merely contains such a key look convertible, e.g.
  # `[{ to_ary: 1 }].flatten` would raise a TypeError.
  HTML_TO_MARKDOWN_IMPLICIT_CONVERSIONS = %i[
    to_ary to_str to_int to_io to_path to_open to_regexp
  ].freeze

  # Dynamic reader for hash entries and internally-tagged enum variants.
  #
  # Supports direct field access like format.sheet_count and variant accessors
  # like format.excel, format.email, etc. Resolution order:
  # 1. a symbol key equal to the method name,
  # 2. a string key equal to the method name,
  # 3. if the hash has a 'format_type' entry whose value equals the method
  #    name (case-insensitively), the variant payload stored under '_0',
  #    or the hash itself when there is no payload.
  # Anything else falls through to the default NoMethodError.
  #
  # @param method_name [Symbol] name of the missing method
  # @param args [Array] ignored; accessors take no arguments
  # @return [Object] the value found by the rules above
  # @raise [NoMethodError] when no rule matches
  # rubocop:disable Metrics/CyclomaticComplexity
  def method_missing(method_name, *args, &block)
    # Never shadow Ruby's implicit conversion protocol with key lookups.
    return super if HTML_TO_MARKDOWN_IMPLICIT_CONVERSIONS.include?(method_name)

    # Try symbol key first (how Magnus converts JSON keys)
    return self[method_name] if key?(method_name)

    # Try string key
    return self[method_name.to_s] if key?(method_name.to_s)

    # A 'format_type' field indicates an internally-tagged enum
    format_type = self[:format_type] || self['format_type']
    return super unless format_type

    # Internally-tagged enums store variant data in the '_0' field (from alef's
    # struct variant conversion), so format.excel returns the wrapped hash.
    return self[:_0] || self['_0'] || self if method_name.to_s.downcase == format_type.to_s.downcase

    super
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # Mirrors #method_missing so respond_to? reports exactly the accessors that
  # would succeed; every other name is delegated to the default implementation.
  #
  # @param method_name [Symbol] name being queried
  # @param include_private [Boolean] forwarded to the default implementation
  # @return [Boolean]
  def respond_to_missing?(method_name, include_private = false)
    return super if HTML_TO_MARKDOWN_IMPLICIT_CONVERSIONS.include?(method_name)
    return true if key?(method_name) || key?(method_name.to_s)

    format_type = self[:format_type] || self['format_type']
    return super unless format_type

    method_name.to_s.downcase == format_type.to_s.downcase || super
  end
end
@@ -0,0 +1,10 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:9c58cf63849e82246f03b4fcc3996c264d47f2b2c27e0e8ba6b93eb4a84cb279
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+ # frozen_string_literal: true
7
+
8
# Namespace of the html-to-markdown gem.
module HtmlToMarkdown
  # Version string of this gem release.
  VERSION = '3.4.0'
end
@@ -0,0 +1,13 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:b671355c68864d5f935b91f875ab29144d9543baad5a955cd926ab9881762a19
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+ # frozen_string_literal: true
7
+
8
+ require_relative 'html_to_markdown/version'
9
+ require_relative 'html_to_markdown/native'
10
+
11
+ module HtmlToMarkdown
12
+ # Intentionally empty: nothing is defined in this body. NOTE(review): the re-export of types and functions from the native extension presumably happens in html_to_markdown/native, which this file requires — confirm.
13
+ end
Binary file
@@ -0,0 +1,24 @@
1
module HtmlToMarkdown
  module CLI
    # Run the CLI with the given arguments.
    #
    # Declared with `self?` because the method is a `module_function`: it is
    # callable as `CLI.run` and also exists as a private instance method
    # (not a public one) on objects that include the module.
    #
    # @param argv Command-line arguments (defaults to ARGV)
    # @param stdout Output stream for standard output
    # @param stderr Output stream for standard error
    # @return Exit code (0 for success, non-zero for failure)
    def self?.run: (
      ?Array[String] argv,
      ?stdout: IO,
      ?stderr: IO
    ) -> Integer
  end
end
@@ -0,0 +1,48 @@
1
module HtmlToMarkdown
  module CLIProxy
    # Base error class
    class Error < StandardError
    end

    # Error when CLI binary is not found
    class MissingBinaryError < Error
    end

    # Error when CLI execution fails
    class CLIExecutionError < Error
      # Captured standard error output of the failed run
      attr_reader stderr: String
      # Exit status of the failed run; may be nil
      attr_reader status: Integer?

      def initialize: (String message, stderr: String, status: Integer?) -> void
    end

    # The methods below are `module_function`s, so each one is declared with
    # `self?`: a singleton method on the module plus a private instance method.

    # Execute CLI with given arguments
    def self?.call: (Array[String] argv) -> String

    # Find the CLI binary in search paths
    def self?.find_cli_binary: () -> Pathname

    # Get root path of the gem
    def self?.root_path: () -> Pathname

    # Get lib path of the gem
    def self?.lib_path: () -> Pathname

    # Get search paths for CLI binary
    def self?.search_paths: (String binary_name) -> Array[Pathname]

    # Get error message for missing binary
    def self?.missing_binary_message: () -> String
  end
end
data/sig/open3.rbs ADDED
@@ -0,0 +1,12 @@
1
+ # Type signature for Open3 standard library
2
+ module Open3
3
+ # Run a command and capture its standard output, standard error, and exit status
4
+ # NOTE(review): only the positional form (a command string followed by string arguments) is declared here.
5
+ # @param cmd Command to execute
6
+ # @param args Command arguments
7
+ # @return Array containing stdout (String), stderr (String), and status (Process::Status)
8
+ def self.capture3: (
9
+ String cmd,
10
+ *String args
11
+ ) -> [String, String, Process::Status]
12
+ end
data/sig/types.rbs ADDED
@@ -0,0 +1,609 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:da88db156d77eefe37cfd0ca53ea75c07abbc5d3ebb7ad977060f871af4c9ff3
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+
7
+ module HtmlToMarkdown
8
+
9
+ VERSION: String
10
+
11
+ class DocumentMetadata
12
+ # Document-level metadata extracted from `<head>` and top-level elements.
13
+ #
14
+ # Contains all metadata typically used by search engines, social media platforms,
15
+ # and browsers for document indexing and presentation.
16
+ #
17
+ # # Examples
18
+ #
19
+ # ```
20
+ # let doc = DocumentMetadata {
21
+ # title: Some("My Article".to_string()),
22
+ # description: Some("A great article about Rust".to_string()),
23
+ # keywords: vec!["rust".to_string(), "programming".to_string()],
24
+ # ..Default::default()
25
+ # };
26
+ #
27
+ # assert_eq!(doc.title, Some("My Article".to_string()));
28
+ # ```
29
+
30
+ attr_accessor title: String?
31
+ attr_accessor description: String?
32
+ attr_accessor keywords: Array[String]?
33
+ attr_accessor author: String?
34
+ attr_accessor canonical_url: String?
35
+ attr_accessor base_href: String?
36
+ attr_accessor language: String?
37
+ attr_accessor text_direction: TextDirection?
38
+ attr_accessor open_graph: Hash[String, String]?
39
+ attr_accessor twitter_card: Hash[String, String]?
40
+ attr_accessor meta_tags: Hash[String, String]?
41
+
42
+ def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
43
+ end
44
+
45
+ class HeaderMetadata
46
+ # Header element metadata with hierarchy tracking.
47
+ #
48
+ # Captures heading elements (h1-h6) with their text content, identifiers,
49
+ # and position in the document structure.
50
+ #
51
+ # # Examples
52
+ #
53
+ # ```
54
+ # let header = HeaderMetadata {
55
+ # level: 1,
56
+ # text: "Main Title".to_string(),
57
+ # id: Some("main-title".to_string()),
58
+ # depth: 0,
59
+ # html_offset: 145,
60
+ # };
61
+ #
62
+ # assert_eq!(header.level, 1);
63
+ # assert!(header.is_valid());
64
+ # ```
65
+
66
+ attr_reader level: Integer
67
+ attr_reader text: String
68
+ attr_reader id: String
69
+ attr_reader depth: Integer
70
+ attr_reader html_offset: Integer
71
+
72
+ def initialize: (level: Integer, text: String, ?id: String, depth: Integer, html_offset: Integer) -> void
73
+ def is_valid: () -> bool
74
+ end
75
+
76
+ class LinkMetadata
77
+ # Hyperlink metadata with categorization and attributes.
78
+ #
79
+ # Represents `<a>` elements with parsed href values, text content, and link type classification.
80
+ #
81
+ # # Examples
82
+ #
83
+ # ```
84
+ # let link = LinkMetadata {
85
+ # href: "https://example.com".to_string(),
86
+ # text: "Example".to_string(),
87
+ # title: Some("Visit Example".to_string()),
88
+ # link_type: LinkType::External,
89
+ # rel: vec!["nofollow".to_string()],
90
+ # attributes: Default::default(),
91
+ # };
92
+ #
93
+ # assert_eq!(link.link_type, LinkType::External);
94
+ # assert_eq!(link.text, "Example");
95
+ # ```
96
+
97
+ attr_reader href: String
98
+ attr_reader text: String
99
+ attr_reader title: String
100
+ attr_reader link_type: LinkType
101
+ attr_reader rel: Array[String]
102
+ attr_reader attributes: Hash[String, String]
103
+
104
+ def initialize: (href: String, text: String, ?title: String, link_type: LinkType, rel: Array[String], attributes: Hash[String, String]) -> void
105
+ def self.classify_link: (String href) -> LinkType
106
+ end
107
+
108
+ class ImageMetadata
109
+ # Image metadata with source and dimensions.
110
+ #
111
+ # Captures `<img>` elements and inline `<svg>` elements with metadata
112
+ # for image analysis and optimization.
113
+ #
114
+ # # Examples
115
+ #
116
+ # ```
117
+ # let img = ImageMetadata {
118
+ # src: "https://example.com/image.jpg".to_string(),
119
+ # alt: Some("An example image".to_string()),
120
+ # title: Some("Example".to_string()),
121
+ # dimensions: Some((800, 600)),
122
+ # image_type: ImageType::External,
123
+ # attributes: Default::default(),
124
+ # };
125
+ #
126
+ # assert_eq!(img.image_type, ImageType::External);
127
+ # ```
128
+
129
+ attr_reader src: String
130
+ attr_reader alt: String
131
+ attr_reader title: String
132
+ attr_reader dimensions: Array[Integer]
133
+ attr_reader image_type: ImageType
134
+ attr_reader attributes: Hash[String, String]
135
+
136
+ def initialize: (src: String, ?alt: String, ?title: String, ?dimensions: Array[Integer], image_type: ImageType, attributes: Hash[String, String]) -> void
137
+ end
138
+
139
+ class StructuredData
140
+ # Structured data block (JSON-LD, Microdata, or RDFa).
141
+ #
142
+ # Represents machine-readable structured data found in the document.
143
+ # JSON-LD blocks are collected as raw JSON strings for flexibility.
144
+ #
145
+ # # Examples
146
+ #
147
+ # ```
148
+ # let schema = StructuredData {
149
+ # data_type: StructuredDataType::JsonLd,
150
+ # raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
151
+ # schema_type: Some("Article".to_string()),
152
+ # };
153
+ #
154
+ # assert_eq!(schema.data_type, StructuredDataType::JsonLd);
155
+ # ```
156
+
157
+ attr_reader data_type: StructuredDataType
158
+ attr_reader raw_json: String
159
+ attr_reader schema_type: String
160
+
161
+ def initialize: (data_type: StructuredDataType, raw_json: String, ?schema_type: String) -> void
162
+ end
163
+
164
+ class HtmlMetadata
165
+ # Comprehensive metadata extraction result from HTML document.
166
+ #
167
+ # Contains all extracted metadata types in a single structure,
168
+ # suitable for serialization and transmission across language boundaries.
169
+ #
170
+ # # Examples
171
+ #
172
+ # ```
173
+ # let metadata = HtmlMetadata {
174
+ # document: Default::default(),
175
+ # headers: Vec::new(),
176
+ # links: Vec::new(),
177
+ # images: Vec::new(),
178
+ # structured_data: Vec::new(),
179
+ # };
180
+ #
181
+ # assert!(metadata.headers.is_empty());
182
+ # ```
183
+
184
+ attr_accessor document: DocumentMetadata?
185
+ attr_accessor headers: Array[HeaderMetadata]?
186
+ attr_accessor links: Array[LinkMetadata]?
187
+ attr_accessor images: Array[ImageMetadata]?
188
+ attr_accessor structured_data: Array[StructuredData]?
189
+
190
+ def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
191
+ end
192
+
193
+ class ConversionOptions
194
+ # Main conversion options for HTML to Markdown conversion.
195
+ #
196
+ # Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
197
+ #
198
+ # # Example
199
+ #
200
+ # ```text
201
+ # use html_to_markdown_rs::ConversionOptions;
202
+ #
203
+ # let options = ConversionOptions::builder()
204
+ # .heading_style(HeadingStyle::Atx)
205
+ # .wrap(true)
206
+ # .wrap_width(100)
207
+ # .build();
208
+ # ```
209
+
210
+ attr_accessor heading_style: HeadingStyle?
211
+ attr_accessor list_indent_type: ListIndentType?
212
+ attr_accessor list_indent_width: Integer?
213
+ attr_accessor bullets: String?
214
+ attr_accessor strong_em_symbol: String?
215
+ attr_accessor escape_asterisks: bool?
216
+ attr_accessor escape_underscores: bool?
217
+ attr_accessor escape_misc: bool?
218
+ attr_accessor escape_ascii: bool?
219
+ attr_accessor code_language: String?
220
+ attr_accessor autolinks: bool?
221
+ attr_accessor default_title: bool?
222
+ attr_accessor br_in_tables: bool?
223
+ attr_accessor highlight_style: HighlightStyle?
224
+ attr_accessor extract_metadata: bool?
225
+ attr_accessor whitespace_mode: WhitespaceMode?
226
+ attr_accessor strip_newlines: bool?
227
+ attr_accessor wrap: bool?
228
+ attr_accessor wrap_width: Integer?
229
+ attr_accessor convert_as_inline: bool?
230
+ attr_accessor sub_symbol: String?
231
+ attr_accessor sup_symbol: String?
232
+ attr_accessor newline_style: NewlineStyle?
233
+ attr_accessor code_block_style: CodeBlockStyle?
234
+ attr_accessor keep_inline_images_in: Array[String]?
235
+ attr_accessor preprocessing: PreprocessingOptions?
236
+ attr_accessor encoding: String?
237
+ attr_accessor debug: bool?
238
+ attr_accessor strip_tags: Array[String]?
239
+ attr_accessor preserve_tags: Array[String]?
240
+ attr_accessor skip_images: bool?
241
+ attr_accessor link_style: LinkStyle?
242
+ attr_accessor output_format: OutputFormat?
243
+ attr_accessor include_document_structure: bool?
244
+ attr_accessor extract_images: bool?
245
+ attr_accessor max_image_size: Integer?
246
+ attr_accessor capture_svg: bool?
247
+ attr_accessor infer_dimensions: bool?
248
+ attr_accessor max_depth: Integer?
249
+ attr_accessor exclude_selectors: Array[String]?
250
+ attr_accessor visitor: VisitorHandle?
251
+
252
+ def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
253
+ def apply_update: (ConversionOptionsUpdate update) -> void
254
+ def self.default: () -> ConversionOptions
255
+ def self.builder: () -> ConversionOptionsBuilder
256
+ def self.from_update: (ConversionOptionsUpdate update) -> ConversionOptions
257
+ def self.from: (ConversionOptionsUpdate update) -> ConversionOptions
258
+ end
259
+
260
+ class ConversionOptionsBuilder
261
+ # Builder for [`ConversionOptions`].
262
+ #
263
+ # All fields start with default values. Call `.build()` to produce the final options.
264
+
265
+ def strip_tags: (Array[String] tags) -> ConversionOptionsBuilder
266
+ def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
267
+ def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
268
+ def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
269
+ def visitor: (?VisitorHandle visitor) -> ConversionOptionsBuilder
270
+ def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
271
+ def build: () -> ConversionOptions
272
+ end
273
+
274
+ class ConversionOptionsUpdate
275
+ # Partial update for `ConversionOptions`.
276
+ #
277
+ # Uses `Option<T>` fields for selective updates. Bindings use this to construct
278
+ # options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
279
+
280
+ attr_accessor heading_style: HeadingStyle?
281
+ attr_accessor list_indent_type: ListIndentType?
282
+ attr_accessor list_indent_width: Integer?
283
+ attr_accessor bullets: String?
284
+ attr_accessor strong_em_symbol: String?
285
+ attr_accessor escape_asterisks: bool?
286
+ attr_accessor escape_underscores: bool?
287
+ attr_accessor escape_misc: bool?
288
+ attr_accessor escape_ascii: bool?
289
+ attr_accessor code_language: String?
290
+ attr_accessor autolinks: bool?
291
+ attr_accessor default_title: bool?
292
+ attr_accessor br_in_tables: bool?
293
+ attr_accessor highlight_style: HighlightStyle?
294
+ attr_accessor extract_metadata: bool?
295
+ attr_accessor whitespace_mode: WhitespaceMode?
296
+ attr_accessor strip_newlines: bool?
297
+ attr_accessor wrap: bool?
298
+ attr_accessor wrap_width: Integer?
299
+ attr_accessor convert_as_inline: bool?
300
+ attr_accessor sub_symbol: String?
301
+ attr_accessor sup_symbol: String?
302
+ attr_accessor newline_style: NewlineStyle?
303
+ attr_accessor code_block_style: CodeBlockStyle?
304
+ attr_accessor keep_inline_images_in: Array[String]?
305
+ attr_accessor preprocessing: PreprocessingOptionsUpdate?
306
+ attr_accessor encoding: String?
307
+ attr_accessor debug: bool?
308
+ attr_accessor strip_tags: Array[String]?
309
+ attr_accessor preserve_tags: Array[String]?
310
+ attr_accessor skip_images: bool?
311
+ attr_accessor link_style: LinkStyle?
312
+ attr_accessor output_format: OutputFormat?
313
+ attr_accessor include_document_structure: bool?
314
+ attr_accessor extract_images: bool?
315
+ attr_accessor max_image_size: Integer?
316
+ attr_accessor capture_svg: bool?
317
+ attr_accessor infer_dimensions: bool?
318
+ attr_accessor max_depth: Integer?
319
+ attr_accessor exclude_selectors: Array[String]?
320
+ attr_accessor visitor: VisitorHandle?
321
+
322
+ def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
323
+ end
324
+
325
+ class PreprocessingOptions
326
+ # HTML preprocessing options for document cleanup before conversion.
327
+
328
+ attr_accessor enabled: bool?
329
+ attr_accessor preset: PreprocessingPreset?
330
+ attr_accessor remove_navigation: bool?
331
+ attr_accessor remove_forms: bool?
332
+
333
+ def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
334
+ def apply_update: (PreprocessingOptionsUpdate update) -> void
335
+ def self.default: () -> PreprocessingOptions
336
+ def self.from_update: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
337
+ def self.from: (PreprocessingOptionsUpdate update) -> PreprocessingOptions
338
+ end
339
+
340
+ class PreprocessingOptionsUpdate
341
+ # Partial update for `PreprocessingOptions`.
342
+ #
343
+ # This struct uses `Option<T>` to represent optional fields that can be selectively updated.
344
+ # Only specified fields (Some values) will override existing options; None values leave the
345
+ # corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
346
+
347
+ attr_accessor enabled: bool?
348
+ attr_accessor preset: PreprocessingPreset?
349
+ attr_accessor remove_navigation: bool?
350
+ attr_accessor remove_forms: bool?
351
+
352
+ def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
353
+ end
354
+
355
+ class DocumentStructure
356
+ # A structured document tree representing the semantic content of an HTML document.
357
+ #
358
+ # Uses a flat node array with index-based parent/child references for efficient traversal.
359
+
360
+ attr_reader nodes: Array[DocumentNode]
361
+ attr_reader source_format: String
362
+
363
+ def initialize: (nodes: Array[DocumentNode], ?source_format: String) -> void
364
+ end
365
+
366
+ class DocumentNode
367
+ # A single node in the document tree.
368
+
369
+ attr_reader id: String
370
+ attr_reader content: NodeContent
371
+ attr_reader parent: Integer
372
+ attr_reader children: Array[Integer]
373
+ attr_reader annotations: Array[TextAnnotation]
374
+ attr_reader attributes: Hash[String, String]
375
+
376
+ def initialize: (id: String, content: NodeContent, ?parent: Integer, children: Array[Integer], annotations: Array[TextAnnotation], ?attributes: Hash[String, String]) -> void
377
+ end
378
+
379
+ class TextAnnotation
380
+ # An inline text annotation with byte-range offsets.
381
+ #
382
+ # Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
383
+
384
+ attr_reader start: Integer
385
+ attr_reader end: Integer
386
+ attr_reader kind: AnnotationKind
387
+
388
+ def initialize: (start: Integer, end: Integer, kind: AnnotationKind) -> void
389
+ end
390
+
391
+ class ConversionResult
392
+ # The primary result of HTML conversion and extraction.
393
+ #
394
+ # Contains the converted text output, optional structured document tree,
395
+ # metadata, extracted tables, images, and processing warnings.
396
+ #
397
+ # # Example
398
+ #
399
+ # ```text
400
+ # use html_to_markdown_rs::{convert, ConversionOptions};
401
+ #
402
+ # let result = convert("<h1>Hello</h1><p>World</p>", None)?;
403
+ # assert!(result.content.is_some());
404
+ # assert!(result.warnings.is_empty());
405
+ # ```
406
+
407
+ attr_accessor content: String?
408
+ attr_accessor document: DocumentStructure?
409
+ attr_accessor metadata: HtmlMetadata?
410
+ attr_accessor tables: Array[TableData]?
411
+ attr_accessor images: Array[String]?
412
+ attr_accessor warnings: Array[ProcessingWarning]?
413
+
414
+ def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
415
+ end
416
+
417
+ class TableGrid
418
+ # A structured table grid with cell-level data including spans.
419
+
420
+ attr_accessor rows: Integer?
421
+ attr_accessor cols: Integer?
422
+ attr_accessor cells: Array[GridCell]?
423
+
424
+ def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
425
+ end
426
+
427
+ class GridCell
428
+ # A single cell in a table grid.
429
+
430
+ attr_reader content: String
431
+ attr_reader row: Integer
432
+ attr_reader col: Integer
433
+ attr_reader row_span: Integer
434
+ attr_reader col_span: Integer
435
+ attr_reader is_header: bool
436
+
437
+ def initialize: (content: String, row: Integer, col: Integer, row_span: Integer, col_span: Integer, is_header: bool) -> void
438
+ end
439
+
440
+ class TableData
441
+ # A top-level extracted table with both structured data and markdown representation.
442
+
443
+ attr_reader grid: TableGrid
444
+ attr_reader markdown: String
445
+
446
+ def initialize: (grid: TableGrid, markdown: String) -> void
447
+ end
448
+
449
+ class ProcessingWarning
450
+ # A non-fatal warning generated during HTML processing.
451
+
452
+ attr_reader message: String
453
+ attr_reader kind: WarningKind
454
+
455
+ def initialize: (message: String, kind: WarningKind) -> void
456
+ end
457
+
458
+ class VisitorHandle
459
+ # Type alias for a visitor handle (Rc-wrapped `RefCell` for interior mutability).
460
+ #
461
+ # This allows visitors to be passed around and shared while still being mutable.
462
+
463
+ end
464
+
465
+ class NodeContext
466
+ # Context information passed to all visitor methods.
467
+ #
468
+ # Provides comprehensive metadata about the current node being visited,
469
+ # including its type, attributes, position in the DOM tree, and parent context.
470
+
471
+ attr_reader node_type: NodeType
472
+ attr_reader tag_name: String
473
+ attr_reader attributes: Hash[String, String]
474
+ attr_reader depth: Integer
475
+ attr_reader index_in_parent: Integer
476
+ attr_reader parent_tag: String
477
+ attr_reader is_inline: bool
478
+
479
+ def initialize: (node_type: NodeType, tag_name: String, attributes: Hash[String, String], depth: Integer, index_in_parent: Integer, ?parent_tag: String, is_inline: bool) -> void
480
+ end
481
+
482
+ class TextDirection
483
+ # Text directionality of document content.
484
+ #
485
+ # Corresponds to the HTML `dir` attribute and `bdi` element directionality.
486
+ type instance = :left_to_right | :right_to_left | :auto
487
+ end
488
+
489
+ class LinkType
490
+ # Link classification based on href value and document context.
491
+ #
492
+ # Used to categorize links during extraction for filtering and analysis.
493
+ type instance = :anchor | :internal | :external | :email | :phone | :other
494
+ end
495
+
496
+ class ImageType
497
+ # Image source classification for proper handling and processing.
498
+ #
499
+ # Determines whether an image is embedded (data URI), inline SVG, external, or relative.
500
+ type instance = :data_uri | :inline_svg | :external | :relative
501
+ end
502
+
503
+ class StructuredDataType
504
+ # Structured data format type.
505
+ #
506
+ # Identifies the schema/format used for structured data markup.
507
+ type instance = :json_ld | :microdata | :r_d_fa
508
+ end
509
+
510
+ class PreprocessingPreset
511
+ # HTML preprocessing aggressiveness level.
512
+ #
513
+ # Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
514
+ type instance = :minimal | :standard | :aggressive
515
+ end
516
+
517
+ class HeadingStyle
518
+ # Heading style options for Markdown output.
519
+ #
520
+ # Controls how headings (h1-h6) are rendered in the output Markdown.
521
+ type instance = :underlined | :atx | :atx_closed
522
+ end
523
+
524
+ class ListIndentType
525
+ # List indentation character type.
526
+ #
527
+ # Controls whether list items are indented with spaces or tabs.
528
+ type instance = :spaces | :tabs
529
+ end
530
+
531
+ class WhitespaceMode
532
+ # Whitespace handling strategy during conversion.
533
+ #
534
+ # Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
535
+ type instance = :normalized | :strict
536
+ end
537
+
538
+ class NewlineStyle
539
+ # Line break syntax in Markdown output.
540
+ #
541
+ # Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
542
+ type instance = :spaces | :backslash
543
+ end
544
+
545
+ class CodeBlockStyle
546
+ # Code block fence style in Markdown output.
547
+ #
548
+ # Determines how code blocks (`<pre><code>`) are rendered in Markdown.
549
+ type instance = :indented | :backticks | :tildes
550
+ end
551
+
552
+ class HighlightStyle
553
+ # Highlight rendering style for `<mark>` elements.
554
+ #
555
+ # Controls how highlighted text is rendered in Markdown output.
556
+ type instance = :double_equal | :html | :bold | :none
557
+ end
558
+
559
+ class LinkStyle
560
+ # Link rendering style in Markdown output.
561
+ #
562
+ # Controls whether links and images use inline `[text](url)` syntax or
563
+ # reference-style `[text][1]` syntax with definitions collected at the end.
564
+ type instance = :inline | :reference
565
+ end
566
+
567
+ class OutputFormat
568
+ # Output format for conversion.
569
+ #
570
+ # Specifies the target markup language format for the conversion output.
571
+ type instance = :markdown | :djot | :plain
572
+ end
573
+
574
+ class NodeContent
575
+ # The semantic content type of a document node.
576
+ #
577
+ # Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
578
+ end
579
+
580
+ class AnnotationKind
581
+ # The type of an inline text annotation.
582
+ #
583
+ # Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
584
+ end
585
+
586
+ class WarningKind
587
+ # Categories of processing warnings.
588
+ type instance = :image_extraction_failed | :encoding_fallback | :truncated_input | :malformed_html | :sanitization_applied | :depth_limit_exceeded
589
+ end
590
+
591
+ class NodeType
592
+ # Node type enumeration covering all HTML element types.
593
+ #
594
+ # This enum categorizes all HTML elements that the converter recognizes,
595
+ # providing a coarse-grained classification for visitor dispatch.
596
+ type instance = :text | :element | :heading | :paragraph | :div | :blockquote | :pre | :hr | :list | :list_item | :definition_list | :definition_term | :definition_description | :table | :table_row | :table_cell | :table_header | :table_body | :table_head | :table_foot | :link | :image | :strong | :em | :code | :strikethrough | :underline | :subscript | :superscript | :mark | :small | :br | :span | :article | :section | :nav | :aside | :header | :footer | :main | :figure | :figcaption | :time | :details | :summary | :form | :input | :select | :option | :button | :textarea | :label | :fieldset | :legend | :audio | :video | :picture | :source | :iframe | :svg | :canvas | :ruby | :rt | :rp | :abbr | :kbd | :samp | :var | :cite | :q | :del | :ins | :data | :meter | :progress | :output | :template | :slot | :html | :head | :body | :title | :meta | :link_tag | :style | :script | :base | :custom
597
+ end
598
+
599
+ class VisitResult
600
+ # Result of a visitor callback.
601
+ #
602
+ # Allows visitors to control the conversion flow by either proceeding
603
+ # with default behavior, providing custom output, skipping elements,
604
+ # preserving HTML, or signaling errors.
605
+ end
606
+
607
+ def self.convert: (String html, ?ConversionOptions options) -> ConversionResult
608
+
609
+ end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html-to-markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 3.4.0
5
+ platform: aarch64-linux
6
+ authors:
7
+ - Kreuzberg Team
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-05-09 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: High-performance HTML to Markdown converter
14
+ email:
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - Steepfile
20
+ - lib/bin/html-to-markdown
21
+ - lib/html_to_markdown.rb
22
+ - lib/html_to_markdown/native.rb
23
+ - lib/html_to_markdown/version.rb
24
+ - lib/html_to_markdown_rb.so
25
+ - sig/html_to_markdown/cli.rbs
26
+ - sig/html_to_markdown/cli_proxy.rbs
27
+ - sig/open3.rbs
28
+ - sig/types.rbs
29
+ homepage: https://github.com/kreuzberg-dev/html-to-markdown
30
+ licenses:
31
+ - MIT
32
+ metadata:
33
+ keywords: html,markdown,converter
34
+ rubygems_mfa_required: 'true'
35
+ post_install_message:
36
+ rdoc_options: []
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 3.2.0
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ requirements: []
50
+ rubygems_version: 3.5.22
51
+ signing_key:
52
+ specification_version: 4
53
+ summary: High-performance HTML to Markdown converter
54
+ test_files: []