html-to-markdown 3.4.0.pre.rc.24-aarch64-linux → 3.4.0.pre.rc.30-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: eeb8b4589ba7a98ab2d206913e1d484305864c7ca1d5700f70dbc901ccfd6854
4
- data.tar.gz: 1e0e434f693eee13aa4b3d8a5d367bf738d197712cdfd0a526b86e478551d0e7
3
+ metadata.gz: 0dc08795dc2bd8efce4fa93d0cd32f48820e5e62006a1ec30de1db83a0756aa6
4
+ data.tar.gz: f69ec46c76e87bedcf33032daff71050bc687c837b184ba434d24a2b8bf8a13c
5
5
  SHA512:
6
- metadata.gz: d375116ad5a261bfa2ff3aa75244c7df761fae4c2b158b4425bf026636105ad1bc61c5f92d9d3cd593552a13c3280e5b24c98a44c98a80ec1cded15db959a792
7
- data.tar.gz: cb0658eb7f815f8ae7e8bbe644396a9f2a7f019e75a1f4d988bc1b03b5cd46400aaec675274026934523eb442ac1678fc6fff9e8f89354dc2a42b47234b2842e
6
+ metadata.gz: d4c0ec31bc6b61106b1203c2b3d5aaef2900e9bf930ebc8c135258f9d85a70c73397e9a2cdfb087a182d235110f49603beb6c71d8b79c3d058f02f6e8b03a7e8
7
+ data.tar.gz: 90f3e6a64111b2dd63c8521205ab906ddf0c95d6c718910baf37a64826d52a6103c3efa81cb111e071624439f3451dd22f9d4466621855ea2615c027b4e07abb
data/Steepfile CHANGED
@@ -1,32 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Steepfile for type checking html-to-markdown Ruby gem
4
-
5
3
  target :lib do
6
4
  signature 'sig'
7
-
8
5
  check 'lib'
9
-
10
- configure_code_diagnostics do |hash|
11
- hash[Steep::Diagnostic::Ruby::UnannotatedEmptyCollection] = :hint
12
- hash[Steep::Diagnostic::Ruby::UnknownConstant] = :hint
13
- hash[Steep::Diagnostic::Ruby::NoMethod] = :hint
14
- end
15
-
16
- # Configure libraries
17
- library 'pathname'
18
- library 'open3'
19
-
20
- # Ignore vendor directory
21
- ignore 'vendor'
22
-
23
- # Ignore spec directory
24
- ignore 'spec'
25
-
26
- # Ignore bin directory
27
- ignore 'bin'
28
-
29
- # Ignore internal implementation modules (not public API)
30
- ignore 'lib/html_to_markdown/cli.rb'
31
- ignore 'lib/html_to_markdown/cli_proxy.rb'
32
6
  end
Binary file
@@ -0,0 +1,21 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:990c47eb4d87f0600f1e6da62d32bf37c84cf430faf31d78f5df7425a93a0f0c
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
6
+ # frozen_string_literal: true
7
+
8
+ require 'json'
9
+ require 'html_to_markdown_rb'
10
+
11
+ module HtmlToMarkdown
12
+ # Re-export all public module functions from the native extension
13
+ HtmlToMarkdownRs.methods(false).each do |m|
14
+ define_singleton_method(m) { |*args, **kwargs, &blk| HtmlToMarkdownRs.public_send(m, *args, **kwargs, &blk) }
15
+ end
16
+
17
+ # Re-export all constants (classes, structs, etc.) from the native extension
18
+ HtmlToMarkdownRs.constants.each do |c|
19
+ const_set(c, HtmlToMarkdownRs.const_get(c)) unless const_defined?(c)
20
+ end
21
+ end
@@ -1,5 +1,10 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:bf6359c7254886342acb441f675467dc2b2e926e46a52f7006642ddfd64583f8
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
1
6
  # frozen_string_literal: true
2
7
 
3
8
  module HtmlToMarkdown
4
- VERSION = '3.4.0.pre.rc.24'
9
+ VERSION = '3.4.0.pre.rc.30'
5
10
  end
@@ -1,40 +1,13 @@
1
+ # This file is auto-generated by alef — DO NOT EDIT.
2
+ # alef:hash:b671355c68864d5f935b91f875ab29144d9543baad5a955cd926ab9881762a19
3
+ # To regenerate: alef generate
4
+ # To verify freshness: alef verify --exit-code
5
+ # Issues & docs: https://github.com/kreuzberg-dev/alef
1
6
  # frozen_string_literal: true
2
7
 
3
8
  require_relative 'html_to_markdown/version'
4
- require 'html_to_markdown_rb'
5
- require 'json'
9
+ require_relative 'html_to_markdown/native'
6
10
 
7
- # High-performance HTML to Markdown conversion.
8
- #
9
- # @example Simple conversion
10
- # HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
11
- #
12
- # @example With options
13
- # HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
14
11
  module HtmlToMarkdown
15
- # Convert HTML to Markdown.
16
- #
17
- # @param html [String] The HTML content to convert.
18
- # @param options [Hash] Optional conversion options.
19
- # Supported keys (all optional):
20
- # - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
21
- # - :code_block_style - 'backticks', 'tildes', 'indented'
22
- # - :escape_asterisks - Boolean
23
- # - :escape_underscores - Boolean
24
- # - :escape_misc - Boolean
25
- # - :escape_ascii - Boolean
26
- # - :strip_newlines - Boolean
27
- # - :keep_inline_images_in - Array of tag names
28
- # - :strip_tags - Array of tag names to strip
29
- # - :preserve_tags - Array of tag names to preserve verbatim
30
- # (and more, matching ConversionOptions fields)
31
- # @return [String] The converted Markdown content.
32
- def self.convert(html, options = {}, visitor = nil)
33
- # The Rust FFI expects options as a JSON string; serialise the hash here
34
- # rather than constructing a ConversionOptions object, which the generated
35
- # FFI layer cannot coerce back to String (see issue #334).
36
- opts_json = options.nil? || options.empty? ? nil : options.to_json
37
- result = HtmlToMarkdownRs.convert(html, opts_json, visitor)
38
- result.content || ''
39
- end
12
+ # Re-export all types and functions from native extension
40
13
  end
Binary file
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f0d66ccd989cb158aa2206dc4fc0596d3e4060cbb323372db1418e22598b6c21
2
+ # alef:hash:da88db156d77eefe37cfd0ca53ea75c07abbc5d3ebb7ad977060f871af4c9ff3
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
@@ -17,7 +17,6 @@ module HtmlToMarkdown
17
17
  # # Examples
18
18
  #
19
19
  # ```
20
- # # use html_to_markdown_rs::metadata::DocumentMetadata;
21
20
  # let doc = DocumentMetadata {
22
21
  # title: Some("My Article".to_string()),
23
22
  # description: Some("A great article about Rust".to_string()),
@@ -28,17 +27,17 @@ module HtmlToMarkdown
28
27
  # assert_eq!(doc.title, Some("My Article".to_string()));
29
28
  # ```
30
29
 
31
- attr_accessor title: String
32
- attr_accessor description: String
33
- attr_accessor keywords: Array[String]
34
- attr_accessor author: String
35
- attr_accessor canonical_url: String
36
- attr_accessor base_href: String
37
- attr_accessor language: String
38
- attr_accessor text_direction: TextDirection
39
- attr_accessor open_graph: Hash[String, String]
40
- attr_accessor twitter_card: Hash[String, String]
41
- attr_accessor meta_tags: Hash[String, String]
30
+ attr_accessor title: String?
31
+ attr_accessor description: String?
32
+ attr_accessor keywords: Array[String]?
33
+ attr_accessor author: String?
34
+ attr_accessor canonical_url: String?
35
+ attr_accessor base_href: String?
36
+ attr_accessor language: String?
37
+ attr_accessor text_direction: TextDirection?
38
+ attr_accessor open_graph: Hash[String, String]?
39
+ attr_accessor twitter_card: Hash[String, String]?
40
+ attr_accessor meta_tags: Hash[String, String]?
42
41
 
43
42
  def initialize: (?title: String, ?description: String, keywords: Array[String], ?author: String, ?canonical_url: String, ?base_href: String, ?language: String, ?text_direction: TextDirection, open_graph: Hash[String, String], twitter_card: Hash[String, String], meta_tags: Hash[String, String]) -> void
44
43
  end
@@ -52,7 +51,6 @@ module HtmlToMarkdown
52
51
  # # Examples
53
52
  #
54
53
  # ```
55
- # # use html_to_markdown_rs::metadata::HeaderMetadata;
56
54
  # let header = HeaderMetadata {
57
55
  # level: 1,
58
56
  # text: "Main Title".to_string(),
@@ -83,7 +81,6 @@ module HtmlToMarkdown
83
81
  # # Examples
84
82
  #
85
83
  # ```
86
- # # use html_to_markdown_rs::metadata::{LinkMetadata, LinkType};
87
84
  # let link = LinkMetadata {
88
85
  # href: "https://example.com".to_string(),
89
86
  # text: "Example".to_string(),
@@ -117,7 +114,6 @@ module HtmlToMarkdown
117
114
  # # Examples
118
115
  #
119
116
  # ```
120
- # # use html_to_markdown_rs::metadata::{ImageMetadata, ImageType};
121
117
  # let img = ImageMetadata {
122
118
  # src: "https://example.com/image.jpg".to_string(),
123
119
  # alt: Some("An example image".to_string()),
@@ -149,7 +145,6 @@ module HtmlToMarkdown
149
145
  # # Examples
150
146
  #
151
147
  # ```
152
- # # use html_to_markdown_rs::metadata::{StructuredData, StructuredDataType};
153
148
  # let schema = StructuredData {
154
149
  # data_type: StructuredDataType::JsonLd,
155
150
  # raw_json: r#"{"@context":"https://schema.org","@type":"Article"}"#.to_string(),
@@ -175,7 +170,6 @@ module HtmlToMarkdown
175
170
  # # Examples
176
171
  #
177
172
  # ```
178
- # # use html_to_markdown_rs::metadata::HtmlMetadata;
179
173
  # let metadata = HtmlMetadata {
180
174
  # document: Default::default(),
181
175
  # headers: Vec::new(),
@@ -187,11 +181,11 @@ module HtmlToMarkdown
187
181
  # assert!(metadata.headers.is_empty());
188
182
  # ```
189
183
 
190
- attr_accessor document: DocumentMetadata
191
- attr_accessor headers: Array[HeaderMetadata]
192
- attr_accessor links: Array[LinkMetadata]
193
- attr_accessor images: Array[ImageMetadata]
194
- attr_accessor structured_data: Array[StructuredData]
184
+ attr_accessor document: DocumentMetadata?
185
+ attr_accessor headers: Array[HeaderMetadata]?
186
+ attr_accessor links: Array[LinkMetadata]?
187
+ attr_accessor images: Array[ImageMetadata]?
188
+ attr_accessor structured_data: Array[StructuredData]?
195
189
 
196
190
  def initialize: (document: DocumentMetadata, headers: Array[HeaderMetadata], links: Array[LinkMetadata], images: Array[ImageMetadata], structured_data: Array[StructuredData]) -> void
197
191
  end
@@ -213,48 +207,49 @@ module HtmlToMarkdown
213
207
  # .build();
214
208
  # ```
215
209
 
216
- attr_accessor heading_style: HeadingStyle
217
- attr_accessor list_indent_type: ListIndentType
218
- attr_accessor list_indent_width: Integer
219
- attr_accessor bullets: String
220
- attr_accessor strong_em_symbol: String
221
- attr_accessor escape_asterisks: bool
222
- attr_accessor escape_underscores: bool
223
- attr_accessor escape_misc: bool
224
- attr_accessor escape_ascii: bool
225
- attr_accessor code_language: String
226
- attr_accessor autolinks: bool
227
- attr_accessor default_title: bool
228
- attr_accessor br_in_tables: bool
229
- attr_accessor highlight_style: HighlightStyle
230
- attr_accessor extract_metadata: bool
231
- attr_accessor whitespace_mode: WhitespaceMode
232
- attr_accessor strip_newlines: bool
233
- attr_accessor wrap: bool
234
- attr_accessor wrap_width: Integer
235
- attr_accessor convert_as_inline: bool
236
- attr_accessor sub_symbol: String
237
- attr_accessor sup_symbol: String
238
- attr_accessor newline_style: NewlineStyle
239
- attr_accessor code_block_style: CodeBlockStyle
240
- attr_accessor keep_inline_images_in: Array[String]
241
- attr_accessor preprocessing: PreprocessingOptions
242
- attr_accessor encoding: String
243
- attr_accessor debug: bool
244
- attr_accessor strip_tags: Array[String]
245
- attr_accessor preserve_tags: Array[String]
246
- attr_accessor skip_images: bool
247
- attr_accessor link_style: LinkStyle
248
- attr_accessor output_format: OutputFormat
249
- attr_accessor include_document_structure: bool
250
- attr_accessor extract_images: bool
251
- attr_accessor max_image_size: Integer
252
- attr_accessor capture_svg: bool
253
- attr_accessor infer_dimensions: bool
254
- attr_accessor max_depth: Integer
255
- attr_accessor exclude_selectors: Array[String]
256
-
257
- def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String]) -> void
210
+ attr_accessor heading_style: HeadingStyle?
211
+ attr_accessor list_indent_type: ListIndentType?
212
+ attr_accessor list_indent_width: Integer?
213
+ attr_accessor bullets: String?
214
+ attr_accessor strong_em_symbol: String?
215
+ attr_accessor escape_asterisks: bool?
216
+ attr_accessor escape_underscores: bool?
217
+ attr_accessor escape_misc: bool?
218
+ attr_accessor escape_ascii: bool?
219
+ attr_accessor code_language: String?
220
+ attr_accessor autolinks: bool?
221
+ attr_accessor default_title: bool?
222
+ attr_accessor br_in_tables: bool?
223
+ attr_accessor highlight_style: HighlightStyle?
224
+ attr_accessor extract_metadata: bool?
225
+ attr_accessor whitespace_mode: WhitespaceMode?
226
+ attr_accessor strip_newlines: bool?
227
+ attr_accessor wrap: bool?
228
+ attr_accessor wrap_width: Integer?
229
+ attr_accessor convert_as_inline: bool?
230
+ attr_accessor sub_symbol: String?
231
+ attr_accessor sup_symbol: String?
232
+ attr_accessor newline_style: NewlineStyle?
233
+ attr_accessor code_block_style: CodeBlockStyle?
234
+ attr_accessor keep_inline_images_in: Array[String]?
235
+ attr_accessor preprocessing: PreprocessingOptions?
236
+ attr_accessor encoding: String?
237
+ attr_accessor debug: bool?
238
+ attr_accessor strip_tags: Array[String]?
239
+ attr_accessor preserve_tags: Array[String]?
240
+ attr_accessor skip_images: bool?
241
+ attr_accessor link_style: LinkStyle?
242
+ attr_accessor output_format: OutputFormat?
243
+ attr_accessor include_document_structure: bool?
244
+ attr_accessor extract_images: bool?
245
+ attr_accessor max_image_size: Integer?
246
+ attr_accessor capture_svg: bool?
247
+ attr_accessor infer_dimensions: bool?
248
+ attr_accessor max_depth: Integer?
249
+ attr_accessor exclude_selectors: Array[String]?
250
+ attr_accessor visitor: VisitorHandle?
251
+
252
+ def initialize: (heading_style: HeadingStyle, list_indent_type: ListIndentType, list_indent_width: Integer, bullets: String, strong_em_symbol: String, escape_asterisks: bool, escape_underscores: bool, escape_misc: bool, escape_ascii: bool, code_language: String, autolinks: bool, default_title: bool, br_in_tables: bool, highlight_style: HighlightStyle, extract_metadata: bool, whitespace_mode: WhitespaceMode, strip_newlines: bool, wrap: bool, wrap_width: Integer, convert_as_inline: bool, sub_symbol: String, sup_symbol: String, newline_style: NewlineStyle, code_block_style: CodeBlockStyle, keep_inline_images_in: Array[String], preprocessing: PreprocessingOptions, encoding: String, debug: bool, strip_tags: Array[String], preserve_tags: Array[String], skip_images: bool, link_style: LinkStyle, output_format: OutputFormat, include_document_structure: bool, extract_images: bool, max_image_size: Integer, capture_svg: bool, infer_dimensions: bool, ?max_depth: Integer, exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
258
253
  def apply_update: (ConversionOptionsUpdate update) -> void
259
254
  def self.default: () -> ConversionOptions
260
255
  def self.builder: () -> ConversionOptionsBuilder
@@ -271,6 +266,7 @@ module HtmlToMarkdown
271
266
  def preserve_tags: (Array[String] tags) -> ConversionOptionsBuilder
272
267
  def keep_inline_images_in: (Array[String] tags) -> ConversionOptionsBuilder
273
268
  def exclude_selectors: (Array[String] selectors) -> ConversionOptionsBuilder
269
+ def visitor: (?VisitorHandle visitor) -> ConversionOptionsBuilder
274
270
  def preprocessing: (PreprocessingOptions preprocessing) -> ConversionOptionsBuilder
275
271
  def build: () -> ConversionOptions
276
272
  end
@@ -281,57 +277,58 @@ module HtmlToMarkdown
281
277
  # Uses `Option<T>` fields for selective updates. Bindings use this to construct
282
278
  # options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
283
279
 
284
- attr_accessor heading_style: HeadingStyle
285
- attr_accessor list_indent_type: ListIndentType
286
- attr_accessor list_indent_width: Integer
287
- attr_accessor bullets: String
288
- attr_accessor strong_em_symbol: String
289
- attr_accessor escape_asterisks: bool
290
- attr_accessor escape_underscores: bool
291
- attr_accessor escape_misc: bool
292
- attr_accessor escape_ascii: bool
293
- attr_accessor code_language: String
294
- attr_accessor autolinks: bool
295
- attr_accessor default_title: bool
296
- attr_accessor br_in_tables: bool
297
- attr_accessor highlight_style: HighlightStyle
298
- attr_accessor extract_metadata: bool
299
- attr_accessor whitespace_mode: WhitespaceMode
300
- attr_accessor strip_newlines: bool
301
- attr_accessor wrap: bool
302
- attr_accessor wrap_width: Integer
303
- attr_accessor convert_as_inline: bool
304
- attr_accessor sub_symbol: String
305
- attr_accessor sup_symbol: String
306
- attr_accessor newline_style: NewlineStyle
307
- attr_accessor code_block_style: CodeBlockStyle
308
- attr_accessor keep_inline_images_in: Array[String]
309
- attr_accessor preprocessing: PreprocessingOptionsUpdate
310
- attr_accessor encoding: String
311
- attr_accessor debug: bool
312
- attr_accessor strip_tags: Array[String]
313
- attr_accessor preserve_tags: Array[String]
314
- attr_accessor skip_images: bool
315
- attr_accessor link_style: LinkStyle
316
- attr_accessor output_format: OutputFormat
317
- attr_accessor include_document_structure: bool
318
- attr_accessor extract_images: bool
319
- attr_accessor max_image_size: Integer
320
- attr_accessor capture_svg: bool
321
- attr_accessor infer_dimensions: bool
280
+ attr_accessor heading_style: HeadingStyle?
281
+ attr_accessor list_indent_type: ListIndentType?
282
+ attr_accessor list_indent_width: Integer?
283
+ attr_accessor bullets: String?
284
+ attr_accessor strong_em_symbol: String?
285
+ attr_accessor escape_asterisks: bool?
286
+ attr_accessor escape_underscores: bool?
287
+ attr_accessor escape_misc: bool?
288
+ attr_accessor escape_ascii: bool?
289
+ attr_accessor code_language: String?
290
+ attr_accessor autolinks: bool?
291
+ attr_accessor default_title: bool?
292
+ attr_accessor br_in_tables: bool?
293
+ attr_accessor highlight_style: HighlightStyle?
294
+ attr_accessor extract_metadata: bool?
295
+ attr_accessor whitespace_mode: WhitespaceMode?
296
+ attr_accessor strip_newlines: bool?
297
+ attr_accessor wrap: bool?
298
+ attr_accessor wrap_width: Integer?
299
+ attr_accessor convert_as_inline: bool?
300
+ attr_accessor sub_symbol: String?
301
+ attr_accessor sup_symbol: String?
302
+ attr_accessor newline_style: NewlineStyle?
303
+ attr_accessor code_block_style: CodeBlockStyle?
304
+ attr_accessor keep_inline_images_in: Array[String]?
305
+ attr_accessor preprocessing: PreprocessingOptionsUpdate?
306
+ attr_accessor encoding: String?
307
+ attr_accessor debug: bool?
308
+ attr_accessor strip_tags: Array[String]?
309
+ attr_accessor preserve_tags: Array[String]?
310
+ attr_accessor skip_images: bool?
311
+ attr_accessor link_style: LinkStyle?
312
+ attr_accessor output_format: OutputFormat?
313
+ attr_accessor include_document_structure: bool?
314
+ attr_accessor extract_images: bool?
315
+ attr_accessor max_image_size: Integer?
316
+ attr_accessor capture_svg: bool?
317
+ attr_accessor infer_dimensions: bool?
322
318
  attr_accessor max_depth: Integer?
323
- attr_accessor exclude_selectors: Array[String]
319
+ attr_accessor exclude_selectors: Array[String]?
320
+ attr_accessor visitor: VisitorHandle?
324
321
 
325
- def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String]) -> void
322
+ def initialize: (?heading_style: HeadingStyle, ?list_indent_type: ListIndentType, ?list_indent_width: Integer, ?bullets: String, ?strong_em_symbol: String, ?escape_asterisks: bool, ?escape_underscores: bool, ?escape_misc: bool, ?escape_ascii: bool, ?code_language: String, ?autolinks: bool, ?default_title: bool, ?br_in_tables: bool, ?highlight_style: HighlightStyle, ?extract_metadata: bool, ?whitespace_mode: WhitespaceMode, ?strip_newlines: bool, ?wrap: bool, ?wrap_width: Integer, ?convert_as_inline: bool, ?sub_symbol: String, ?sup_symbol: String, ?newline_style: NewlineStyle, ?code_block_style: CodeBlockStyle, ?keep_inline_images_in: Array[String], ?preprocessing: PreprocessingOptionsUpdate, ?encoding: String, ?debug: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?skip_images: bool, ?link_style: LinkStyle, ?output_format: OutputFormat, ?include_document_structure: bool, ?extract_images: bool, ?max_image_size: Integer, ?capture_svg: bool, ?infer_dimensions: bool, ?max_depth: Integer?, ?exclude_selectors: Array[String], ?visitor: VisitorHandle) -> void
326
323
  end
327
324
 
328
325
  class PreprocessingOptions
329
326
  # HTML preprocessing options for document cleanup before conversion.
330
327
 
331
- attr_accessor enabled: bool
332
- attr_accessor preset: PreprocessingPreset
333
- attr_accessor remove_navigation: bool
334
- attr_accessor remove_forms: bool
328
+ attr_accessor enabled: bool?
329
+ attr_accessor preset: PreprocessingPreset?
330
+ attr_accessor remove_navigation: bool?
331
+ attr_accessor remove_forms: bool?
335
332
 
336
333
  def initialize: (enabled: bool, preset: PreprocessingPreset, remove_navigation: bool, remove_forms: bool) -> void
337
334
  def apply_update: (PreprocessingOptionsUpdate update) -> void
@@ -347,10 +344,10 @@ module HtmlToMarkdown
347
344
  # Only specified fields (Some values) will override existing options; None values leave the
348
345
  # corresponding fields unchanged when applied via [`PreprocessingOptions::apply_update`].
349
346
 
350
- attr_accessor enabled: bool
351
- attr_accessor preset: PreprocessingPreset
352
- attr_accessor remove_navigation: bool
353
- attr_accessor remove_forms: bool
347
+ attr_accessor enabled: bool?
348
+ attr_accessor preset: PreprocessingPreset?
349
+ attr_accessor remove_navigation: bool?
350
+ attr_accessor remove_forms: bool?
354
351
 
355
352
  def initialize: (?enabled: bool, ?preset: PreprocessingPreset, ?remove_navigation: bool, ?remove_forms: bool) -> void
356
353
  end
@@ -407,12 +404,12 @@ module HtmlToMarkdown
407
404
  # assert!(result.warnings.is_empty());
408
405
  # ```
409
406
 
410
- attr_accessor content: String
411
- attr_accessor document: DocumentStructure
412
- attr_accessor metadata: HtmlMetadata
413
- attr_accessor tables: Array[TableData]
414
- attr_accessor images: Array[String]
415
- attr_accessor warnings: Array[ProcessingWarning]
407
+ attr_accessor content: String?
408
+ attr_accessor document: DocumentStructure?
409
+ attr_accessor metadata: HtmlMetadata?
410
+ attr_accessor tables: Array[TableData]?
411
+ attr_accessor images: Array[String]?
412
+ attr_accessor warnings: Array[ProcessingWarning]?
416
413
 
417
414
  def initialize: (?content: String, ?document: DocumentStructure, metadata: HtmlMetadata, tables: Array[TableData], images: Array[String], warnings: Array[ProcessingWarning]) -> void
418
415
  end
@@ -420,9 +417,9 @@ module HtmlToMarkdown
420
417
  class TableGrid
421
418
  # A structured table grid with cell-level data including spans.
422
419
 
423
- attr_accessor rows: Integer
424
- attr_accessor cols: Integer
425
- attr_accessor cells: Array[GridCell]
420
+ attr_accessor rows: Integer?
421
+ attr_accessor cols: Integer?
422
+ attr_accessor cells: Array[GridCell]?
426
423
 
427
424
  def initialize: (rows: Integer, cols: Integer, cells: Array[GridCell]) -> void
428
425
  end
@@ -458,6 +455,13 @@ module HtmlToMarkdown
458
455
  def initialize: (message: String, kind: WarningKind) -> void
459
456
  end
460
457
 
458
+ class VisitorHandle
459
+ # Type alias for a visitor handle (Rc-wrapped `RefCell` for interior mutability).
460
+ #
461
+ # This allows visitors to be passed around and shared while still being mutable.
462
+
463
+ end
464
+
461
465
  class NodeContext
462
466
  # Context information passed to all visitor methods.
463
467
  #
@@ -479,112 +483,77 @@ module HtmlToMarkdown
479
483
  # Text directionality of document content.
480
484
  #
481
485
  # Corresponds to the HTML `dir` attribute and `bdi` element directionality.
482
-
483
- LeftToRight: Integer
484
- RightToLeft: Integer
485
- Auto: Integer
486
+ type instance = :left_to_right | :right_to_left | :auto
486
487
  end
487
488
 
488
489
  class LinkType
489
490
  # Link classification based on href value and document context.
490
491
  #
491
492
  # Used to categorize links during extraction for filtering and analysis.
492
-
493
- Anchor: Integer
494
- Internal: Integer
495
- External: Integer
496
- Email: Integer
497
- Phone: Integer
498
- Other: Integer
493
+ type instance = :anchor | :internal | :external | :email | :phone | :other
499
494
  end
500
495
 
501
496
  class ImageType
502
497
  # Image source classification for proper handling and processing.
503
498
  #
504
499
  # Determines whether an image is embedded (data URI), inline SVG, external, or relative.
505
-
506
- DataUri: Integer
507
- InlineSvg: Integer
508
- External: Integer
509
- Relative: Integer
500
+ type instance = :data_uri | :inline_svg | :external | :relative
510
501
  end
511
502
 
512
503
  class StructuredDataType
513
504
  # Structured data format type.
514
505
  #
515
506
  # Identifies the schema/format used for structured data markup.
516
-
517
- JsonLd: Integer
518
- Microdata: Integer
519
- RDFa: Integer
507
+ type instance = :json_ld | :microdata | :r_d_fa
520
508
  end
521
509
 
522
510
  class PreprocessingPreset
523
511
  # HTML preprocessing aggressiveness level.
524
512
  #
525
513
  # Controls the extent of cleanup performed before conversion. Higher levels remove more elements.
526
-
527
- Minimal: Integer
528
- Standard: Integer
529
- Aggressive: Integer
514
+ type instance = :minimal | :standard | :aggressive
530
515
  end
531
516
 
532
517
  class HeadingStyle
533
518
  # Heading style options for Markdown output.
534
519
  #
535
520
  # Controls how headings (h1-h6) are rendered in the output Markdown.
536
-
537
- Underlined: Integer
538
- Atx: Integer
539
- AtxClosed: Integer
521
+ type instance = :underlined | :atx | :atx_closed
540
522
  end
541
523
 
542
524
  class ListIndentType
543
525
  # List indentation character type.
544
526
  #
545
527
  # Controls whether list items are indented with spaces or tabs.
546
-
547
- Spaces: Integer
548
- Tabs: Integer
528
+ type instance = :spaces | :tabs
549
529
  end
550
530
 
551
531
  class WhitespaceMode
552
532
  # Whitespace handling strategy during conversion.
553
533
  #
554
534
  # Determines how sequences of whitespace characters (spaces, tabs, newlines) are processed.
555
-
556
- Normalized: Integer
557
- Strict: Integer
535
+ type instance = :normalized | :strict
558
536
  end
559
537
 
560
538
  class NewlineStyle
561
539
  # Line break syntax in Markdown output.
562
540
  #
563
541
  # Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
564
-
565
- Spaces: Integer
566
- Backslash: Integer
542
+ type instance = :spaces | :backslash
567
543
  end
568
544
 
569
545
  class CodeBlockStyle
570
546
  # Code block fence style in Markdown output.
571
547
  #
572
548
  # Determines how code blocks (`<pre><code>`) are rendered in Markdown.
573
-
574
- Indented: Integer
575
- Backticks: Integer
576
- Tildes: Integer
549
+ type instance = :indented | :backticks | :tildes
577
550
  end
578
551
 
579
552
  class HighlightStyle
580
553
  # Highlight rendering style for `<mark>` elements.
581
554
  #
582
555
  # Controls how highlighted text is rendered in Markdown output.
583
-
584
- DoubleEqual: Integer
585
- Html: Integer
586
- Bold: Integer
587
- None: Integer
556
+ type instance = :double_equal | :html | :bold | :none
588
557
  end
589
558
 
590
559
  class LinkStyle
@@ -592,66 +561,31 @@ module HtmlToMarkdown
592
561
  #
593
562
  # Controls whether links and images use inline `[text](url)` syntax or
594
563
  # reference-style `[text][1]` syntax with definitions collected at the end.
595
-
596
- Inline: Integer
597
- Reference: Integer
564
+ type instance = :inline | :reference
598
565
  end
599
566
 
600
567
  class OutputFormat
601
568
  # Output format for conversion.
602
569
  #
603
570
  # Specifies the target markup language format for the conversion output.
604
-
605
- Markdown: Integer
606
- Djot: Integer
607
- Plain: Integer
571
+ type instance = :markdown | :djot | :plain
608
572
  end
609
573
 
610
574
  class NodeContent
611
575
  # The semantic content type of a document node.
612
576
  #
613
577
  # Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
614
-
615
- Heading: Integer
616
- Paragraph: Integer
617
- List: Integer
618
- ListItem: Integer
619
- Table: Integer
620
- Image: Integer
621
- Code: Integer
622
- Quote: Integer
623
- DefinitionList: Integer
624
- DefinitionItem: Integer
625
- RawBlock: Integer
626
- MetadataBlock: Integer
627
- Group: Integer
628
578
  end
629
579
 
630
580
  class AnnotationKind
631
581
  # The type of an inline text annotation.
632
582
  #
633
583
  # Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
634
-
635
- Bold: Integer
636
- Italic: Integer
637
- Underline: Integer
638
- Strikethrough: Integer
639
- Code: Integer
640
- Subscript: Integer
641
- Superscript: Integer
642
- Highlight: Integer
643
- Link: Integer
644
584
  end
645
585
 
646
586
  class WarningKind
647
587
  # Categories of processing warnings.
648
-
649
- ImageExtractionFailed: Integer
650
- EncodingFallback: Integer
651
- TruncatedInput: Integer
652
- MalformedHtml: Integer
653
- SanitizationApplied: Integer
654
- DepthLimitExceeded: Integer
588
+ type instance = :image_extraction_failed | :encoding_fallback | :truncated_input | :malformed_html | :sanitization_applied | :depth_limit_exceeded
655
589
  end
656
590
 
657
591
  class NodeType
@@ -659,95 +593,7 @@ module HtmlToMarkdown
659
593
  #
660
594
  # This enum categorizes all HTML elements that the converter recognizes,
661
595
  # providing a coarse-grained classification for visitor dispatch.
662
-
663
- Text: Integer
664
- Element: Integer
665
- Heading: Integer
666
- Paragraph: Integer
667
- Div: Integer
668
- Blockquote: Integer
669
- Pre: Integer
670
- Hr: Integer
671
- List: Integer
672
- ListItem: Integer
673
- DefinitionList: Integer
674
- DefinitionTerm: Integer
675
- DefinitionDescription: Integer
676
- Table: Integer
677
- TableRow: Integer
678
- TableCell: Integer
679
- TableHeader: Integer
680
- TableBody: Integer
681
- TableHead: Integer
682
- TableFoot: Integer
683
- Link: Integer
684
- Image: Integer
685
- Strong: Integer
686
- Em: Integer
687
- Code: Integer
688
- Strikethrough: Integer
689
- Underline: Integer
690
- Subscript: Integer
691
- Superscript: Integer
692
- Mark: Integer
693
- Small: Integer
694
- Br: Integer
695
- Span: Integer
696
- Article: Integer
697
- Section: Integer
698
- Nav: Integer
699
- Aside: Integer
700
- Header: Integer
701
- Footer: Integer
702
- Main: Integer
703
- Figure: Integer
704
- Figcaption: Integer
705
- Time: Integer
706
- Details: Integer
707
- Summary: Integer
708
- Form: Integer
709
- Input: Integer
710
- Select: Integer
711
- Option: Integer
712
- Button: Integer
713
- Textarea: Integer
714
- Label: Integer
715
- Fieldset: Integer
716
- Legend: Integer
717
- Audio: Integer
718
- Video: Integer
719
- Picture: Integer
720
- Source: Integer
721
- Iframe: Integer
722
- Svg: Integer
723
- Canvas: Integer
724
- Ruby: Integer
725
- Rt: Integer
726
- Rp: Integer
727
- Abbr: Integer
728
- Kbd: Integer
729
- Samp: Integer
730
- Var: Integer
731
- Cite: Integer
732
- Q: Integer
733
- Del: Integer
734
- Ins: Integer
735
- Data: Integer
736
- Meter: Integer
737
- Progress: Integer
738
- Output: Integer
739
- Template: Integer
740
- Slot: Integer
741
- Html: Integer
742
- Head: Integer
743
- Body: Integer
744
- Title: Integer
745
- Meta: Integer
746
- LinkTag: Integer
747
- Style: Integer
748
- Script: Integer
749
- Base: Integer
750
- Custom: Integer
596
+ type instance = :text | :element | :heading | :paragraph | :div | :blockquote | :pre | :hr | :list | :list_item | :definition_list | :definition_term | :definition_description | :table | :table_row | :table_cell | :table_header | :table_body | :table_head | :table_foot | :link | :image | :strong | :em | :code | :strikethrough | :underline | :subscript | :superscript | :mark | :small | :br | :span | :article | :section | :nav | :aside | :header | :footer | :main | :figure | :figcaption | :time | :details | :summary | :form | :input | :select | :option | :button | :textarea | :label | :fieldset | :legend | :audio | :video | :picture | :source | :iframe | :svg | :canvas | :ruby | :rt | :rp | :abbr | :kbd | :samp | :var | :cite | :q | :del | :ins | :data | :meter | :progress | :output | :template | :slot | :html | :head | :body | :title | :meta | :link_tag | :style | :script | :base | :custom
751
597
  end
752
598
 
753
599
  class VisitResult
@@ -756,14 +602,8 @@ module HtmlToMarkdown
756
602
  # Allows visitors to control the conversion flow by either proceeding
757
603
  # with default behavior, providing custom output, skipping elements,
758
604
  # preserving HTML, or signaling errors.
759
-
760
- Continue: Integer
761
- Custom: Integer
762
- Skip: Integer
763
- PreserveHtml: Integer
764
- Error: Integer
765
605
  end
766
606
 
767
- def self.convert: (String html, ?ConversionOptions options, ?String visitor) -> ConversionResult
607
+ def self.convert: (String html, ?ConversionOptions options) -> ConversionResult
768
608
 
769
609
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.0.pre.rc.24
4
+ version: 3.4.0.pre.rc.30
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Kreuzberg Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-01 00:00:00.000000000 Z
11
+ date: 2026-05-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: High-performance HTML to Markdown converter
14
14
  email:
@@ -19,6 +19,7 @@ files:
19
19
  - Steepfile
20
20
  - lib/bin/html-to-markdown
21
21
  - lib/html_to_markdown.rb
22
+ - lib/html_to_markdown/native.rb
22
23
  - lib/html_to_markdown/version.rb
23
24
  - lib/html_to_markdown_rb.so
24
25
  - sig/html_to_markdown/cli.rbs