html-to-markdown 2.29.0 → 3.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -7,205 +7,24 @@ module HtmlToMarkdown
7
7
  autoload :CLI, 'html_to_markdown/cli'
8
8
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
9
 
10
- class Options; end # rubocop:disable Lint/EmptyClass
11
-
12
10
  class << self
13
11
  alias native_convert convert
14
- alias native_convert_with_inline_images convert_with_inline_images
15
- alias native_convert_with_inline_images_handle convert_with_inline_images_handle
16
- alias native_options options
17
- alias native_convert_with_options convert_with_options
18
- alias native_convert_with_metadata convert_with_metadata
19
- alias native_convert_with_metadata_handle convert_with_metadata_handle
20
- alias native_convert_with_visitor convert_with_visitor
21
- alias native_convert_with_tables convert_with_tables
22
12
  end
23
13
 
24
14
  module_function
25
15
 
26
- def convert(html, options = nil, visitor = nil)
27
- if visitor
28
- native_convert_with_visitor(html.to_s, options, visitor)
29
- else
30
- native_convert(html.to_s, options)
31
- end
32
- end
33
-
34
- def convert_with_options(html, options_handle)
35
- native_convert_with_options(html.to_s, options_handle)
36
- end
37
-
38
- def convert_with_inline_images(html, options = nil, image_config = nil, _visitor = nil)
39
- # NOTE: visitor parameter is accepted for API compatibility but not used in inline images mode
40
- # The visitor pattern is only supported in the standard convert() method
41
- native_convert_with_inline_images(html.to_s, options, image_config)
42
- end
43
-
44
- def convert_with_inline_images_handle(html, options_handle, image_config = nil)
45
- native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
46
- end
47
-
48
- def options(options_hash = nil)
49
- native_options(options_hash)
50
- end
51
-
52
- # Convert HTML to Markdown with comprehensive metadata extraction.
53
- #
54
- # Performs HTML-to-Markdown conversion while extracting document metadata, headers,
55
- # links, images, and structured data in a single pass. Ideal for content analysis,
56
- # SEO workflows, and document indexing.
57
- #
58
- # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
59
- # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
60
- # When a Hash, keys should match ConversionOptions field names (as symbols or strings).
61
- # Common options:
62
- # - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
63
- # - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
64
- # - :list_indent_width [Integer] Spaces per indent level (default: 4)
65
- # - :wrap [true, false] Enable text wrapping (default: false)
66
- # - :wrap_width [Integer] Wrap at this column width (default: 80)
67
- # See ConversionOptions documentation for complete list.
68
- #
69
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
70
- # Keys should be symbols or strings. Supported keys:
71
- # - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
72
- # - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
73
- # - :extract_images [true, false] Extract image elements (default: true)
74
- # - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
75
- # - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
76
- #
77
- # @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
78
- # markdown_string: String - The converted Markdown output
79
- #
80
- # metadata_hash: Hash with keys:
81
- # - :document [Hash] Document-level metadata:
82
- # - :title [String, nil] From <title> tag
83
- # - :description [String, nil] From <meta name="description">
84
- # - :keywords [Array<String>] From <meta name="keywords">
85
- # - :author [String, nil] From <meta name="author">
86
- # - :language [String, nil] From lang attribute (e.g., "en")
87
- # - :text_direction [String, nil] "ltr", "rtl", or "auto"
88
- # - :canonical_url [String, nil] From <link rel="canonical">
89
- # - :base_href [String, nil] From <base href="">
90
- # - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
91
- # - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
92
- # - :meta_tags [Hash<String, String>] Other meta tags
93
- #
94
- # - :headers [Array<Hash>] Heading elements:
95
- # - :level [Integer] 1-6
96
- # - :text [String] Header text content
97
- # - :id [String, nil] HTML id attribute
98
- # - :depth [Integer] Tree nesting depth
99
- # - :html_offset [Integer] Byte offset in original HTML
100
- #
101
- # - :links [Array<Hash>] Hyperlinks:
102
- # - :href [String] Link URL
103
- # - :text [String] Link text content
104
- # - :title [String, nil] Title attribute
105
- # - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
106
- # - :rel [Array<String>] Rel attribute values
107
- # - :attributes [Hash<String, String>] Additional HTML attributes
108
- #
109
- # - :images [Array<Hash>] Image elements:
110
- # - :src [String] Image source URL or data URI
111
- # - :alt [String, nil] Alt text for accessibility
112
- # - :title [String, nil] Title attribute
113
- # - :dimensions [Array<Integer>, nil] [width, height] if available
114
- # - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
115
- # - :attributes [Hash<String, String>] Additional HTML attributes
116
- #
117
- # - :structured_data [Array<Hash>] Structured data blocks:
118
- # - :data_type [String] "json_ld", "microdata", or "rdfa"
119
- # - :raw_json [String] Raw JSON content
120
- # - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
121
- #
122
- # @raise [StandardError] If conversion fails or invalid configuration
123
- #
124
- # @example Basic usage
125
- # html = <<~HTML
126
- # <html lang="en">
127
- # <head>
128
- # <title>My Article</title>
129
- # <meta name="description" content="A great read">
130
- # </head>
131
- # <body>
132
- # <h1 id="intro">Introduction</h1>
133
- # <p>Visit <a href="https://example.com">our site</a></p>
134
- # <img src="photo.jpg" alt="Beautiful landscape">
135
- # </body>
136
- # </html>
137
- # HTML
138
- #
139
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
140
- #
141
- # puts metadata[:document][:title] # => "My Article"
142
- # puts metadata[:document][:language] # => "en"
143
- # puts metadata[:headers].length # => 1
144
- # puts metadata[:headers][0][:text] # => "Introduction"
145
- # puts metadata[:links].length # => 1
146
- # puts metadata[:images].length # => 1
147
- #
148
- # @example With selective metadata extraction
149
- # config = {
150
- # extract_headers: true,
151
- # extract_links: true,
152
- # extract_images: false, # Skip images
153
- # extract_structured_data: false # Skip structured data
154
- # }
155
- #
156
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
157
- # puts metadata[:images].empty? # => true (not extracted)
158
- #
159
- # @example With conversion options
160
- # options = {
161
- # heading_style: "atx", # Use # H1, ## H2 style
162
- # wrap: true,
163
- # wrap_width: 80
164
- # }
165
- #
166
- # config = { extract_headers: true }
167
- #
168
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
169
- # # Markdown uses ATX-style headings and wraps at 80 characters
170
- #
171
- # @see #convert Simple conversion without metadata
172
- # @see #convert_with_inline_images Extract inline images during conversion
173
- # @see ConversionOptions Detailed conversion configuration
174
- def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
175
- # NOTE: visitor parameter is accepted for API compatibility but not used in metadata extraction mode
176
- # The visitor pattern is only supported in the standard convert() method
177
- native_convert_with_metadata(html.to_s, options, metadata_config)
178
- end
179
-
180
- def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
181
- native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
182
- end
183
-
184
- # Convert HTML to Markdown with table extraction.
185
- #
186
- # Performs HTML-to-Markdown conversion while extracting structured table data
187
- # (cells, markdown representation, header row flags) in a single pass.
188
- #
189
- # @param html [String] HTML string to convert.
190
- # @param options [Hash, nil] Optional conversion configuration.
191
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
192
- #
193
- # @return [Hash] A hash with keys:
194
- # - :content [String] The converted Markdown output
195
- # - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
196
- # - :tables [Array<Hash>] Extracted tables, each with:
197
- # - :cells [Array<Array<String>>] Table cells organized as rows x columns
198
- # - :markdown [String] Complete rendered table in Markdown format
199
- # - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
200
- #
201
- # @raise [StandardError] If conversion fails or invalid configuration
202
- #
203
- # @example Basic usage
204
- # html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
205
- # result = HtmlToMarkdown.convert_with_tables(html)
206
- # puts result[:tables].length # => 1
207
- # puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
208
- def convert_with_tables(html, options = nil, metadata_config = nil)
209
- native_convert_with_tables(html.to_s, options, metadata_config)
16
+ # Convert HTML to Markdown, returning a Hash with:
17
+ # - :content [String, nil] the converted Markdown output
18
+ # - :document [nil] document structure (not yet exposed)
19
+ # - :metadata [Hash, nil] extracted HTML metadata
20
+ # - :tables [Array<Hash>] extracted tables with :grid and :markdown
21
+ # - :images [Array<Hash>] extracted inline images
22
+ # - :warnings [Array<Hash>] processing warnings
23
+ #
24
+ # @param html [String] HTML string to convert
25
+ # @param options [Hash, nil] optional conversion options
26
+ # @return [Hash] conversion result
27
+ def convert(html, options = nil)
28
+ native_convert(html.to_s, options)
210
29
  end
211
30
  end
@@ -2,36 +2,6 @@
2
2
  module HtmlToMarkdown
3
3
  VERSION: String
4
4
 
5
- # Opaque handle for reusable conversion options
6
- class Options
7
- end
8
-
9
- # Visitor context information passed to visitor callbacks
10
- class NodeContext
11
- attr_reader node_type: Symbol
12
- attr_reader tag_name: String
13
- attr_reader attributes: Hash[String, String]
14
- attr_reader depth: Integer
15
- attr_reader index_in_parent: Integer
16
- attr_reader parent_tag: String | nil
17
- attr_reader is_inline: bool
18
- end
19
-
20
- # Result of a visitor callback
21
- type visitor_result = {
22
- type: :continue,
23
- } | {
24
- type: :custom,
25
- output: String,
26
- } | {
27
- type: :skip,
28
- } | {
29
- type: :preserve_html,
30
- } | {
31
- type: :error,
32
- message: String,
33
- }
34
-
35
5
  type heading_style = :underlined | :atx | :atx_closed
36
6
  type list_indent_type = :spaces | :tabs
37
7
  type highlight_style = :double_equal | :html | :bold | :none
@@ -62,7 +32,6 @@ module HtmlToMarkdown
62
32
  autolinks?: bool,
63
33
  default_title?: bool,
64
34
  br_in_tables?: bool,
65
- hocr_spatial_tables?: bool,
66
35
  highlight_style?: highlight_style,
67
36
  extract_metadata?: bool,
68
37
  whitespace_mode?: whitespace_mode,
@@ -81,50 +50,14 @@ module HtmlToMarkdown
81
50
  strip_tags?: Array[String],
82
51
  preserve_tags?: Array[String],
83
52
  output_format?: output_format,
84
- skip_images?: bool
85
- }
86
-
87
- type inline_image_config = {
88
- max_decoded_size_bytes?: Integer,
89
- filename_prefix?: String?,
53
+ skip_images?: bool,
54
+ include_document_structure?: bool,
55
+ extract_images?: bool,
56
+ max_image_size?: Integer,
90
57
  capture_svg?: bool,
91
58
  infer_dimensions?: bool
92
59
  }
93
60
 
94
- type inline_image_format = "png" | "jpeg" | "gif" | "bmp" | "webp" | "svg" | String
95
-
96
- type inline_image_source = "img_data_uri" | "svg_element"
97
-
98
- type inline_image = {
99
- data: String,
100
- format: inline_image_format,
101
- filename: String?,
102
- description: String?,
103
- dimensions: [Integer, Integer]?,
104
- source: inline_image_source,
105
- attributes: Hash[String, String]
106
- }
107
-
108
- type inline_image_warning = {
109
- index: Integer,
110
- message: String
111
- }
112
-
113
- type html_extraction = {
114
- markdown: String,
115
- inline_images: Array[inline_image],
116
- warnings: Array[inline_image_warning]
117
- }
118
-
119
- type metadata_config = {
120
- extract_document?: bool,
121
- extract_headers?: bool,
122
- extract_links?: bool,
123
- extract_images?: bool,
124
- extract_structured_data?: bool,
125
- max_structured_data_size?: Integer
126
- }
127
-
128
61
  type text_direction = "ltr" | "rtl" | "auto" | nil
129
62
 
130
63
  type document_metadata = {
@@ -185,314 +118,20 @@ module HtmlToMarkdown
185
118
  structured_data: Array[structured_data]
186
119
  }
187
120
 
188
- type table_data = {
189
- cells: Array[Array[String]],
190
- markdown: String,
191
- is_header_row: Array[bool]
192
- }
193
-
194
- type table_extraction_result = {
195
- content: String,
196
- metadata: extended_metadata?,
197
- tables: Array[table_data]
198
- }
199
-
200
- # Native methods (implemented in Rust via Magnus/rb-sys)
201
- # These are aliased from the Rust extension and available as both module and instance methods
121
+ # Native method (implemented in Rust via Magnus/rb-sys)
202
122
  private
203
123
 
204
- def self.native_convert: (String html, conversion_options? options) -> String
205
- def self.native_options: (conversion_options? options_hash) -> Options
206
- def self.native_convert_with_options: (String html, Options options_handle) -> String
207
- def self.native_convert_with_inline_images_handle: (
208
- String html,
209
- Options options_handle,
210
- inline_image_config? image_config
211
- ) -> html_extraction
212
- def self.native_convert_with_inline_images: (
213
- String html,
214
- conversion_options? options,
215
- inline_image_config? image_config
216
- ) -> html_extraction
217
- def self.native_convert_with_metadata_handle: (
218
- String html,
219
- Options options_handle,
220
- metadata_config? metadata_config
221
- ) -> [String, extended_metadata]
222
- def self.native_convert_with_metadata: (
223
- String html,
224
- conversion_options? options,
225
- metadata_config? metadata_config
226
- ) -> [String, extended_metadata]
227
- def self.native_convert_with_visitor: (
228
- String html,
229
- conversion_options? options,
230
- visitor? visitor
231
- ) -> String
232
- def self.native_convert_with_tables: (
233
- String html,
234
- conversion_options? options,
235
- metadata_config? metadata_config
236
- ) -> table_extraction_result
237
-
238
- def native_convert: (String html, conversion_options? options) -> String
239
- def native_options: (conversion_options? options_hash) -> Options
240
- def native_convert_with_options: (String html, Options options_handle) -> String
241
- def native_convert_with_inline_images_handle: (
242
- String html,
243
- Options options_handle,
244
- inline_image_config? image_config
245
- ) -> html_extraction
246
- def native_convert_with_inline_images: (
247
- String html,
248
- conversion_options? options,
249
- inline_image_config? image_config
250
- ) -> html_extraction
251
- def native_convert_with_metadata_handle: (
252
- String html,
253
- Options options_handle,
254
- metadata_config? metadata_config
255
- ) -> [String, extended_metadata]
256
- def native_convert_with_metadata: (
257
- String html,
258
- conversion_options? options,
259
- metadata_config? metadata_config
260
- ) -> [String, extended_metadata]
261
- def native_convert_with_visitor: (
262
- String html,
263
- conversion_options? options,
264
- visitor? visitor
265
- ) -> String
266
- def native_convert_with_tables: (
267
- String html,
268
- conversion_options? options,
269
- metadata_config? metadata_config
270
- ) -> table_extraction_result
271
-
272
- # Visitor interface for customizing conversion behavior
273
- type visitor = Object
124
+ def self.native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
125
+ def native_convert: (String html, conversion_options? options) -> Hash[String, untyped]
274
126
 
275
127
  public
276
128
 
277
- # Convert HTML to Markdown with optional configuration and visitor
278
- #
279
- # The optional visitor parameter allows customization of conversion behavior for specific elements.
280
- # When both options and visitor are provided, the visitor can override default conversions.
281
- #
282
- # Args:
283
- # html: HTML string to convert
284
- # options: Optional conversion configuration
285
- # visitor: Optional visitor object for customizing conversion
286
- #
287
- # Returns:
288
- # markdown: String - Converted markdown output
289
- #
290
- # Example:
291
- # markdown = HtmlToMarkdown.convert(html, { wrap: true }, my_visitor)
292
- def self.convert: (String html, ?conversion_options options, ?visitor visitor) -> String
293
-
294
- # Create a reusable options handle for performance
295
- def self.options: (?conversion_options options_hash) -> Options
296
-
297
- # Convert HTML using a pre-built options handle
298
- def self.convert_with_options: (String html, Options options_handle) -> String
299
- def self.convert_with_inline_images_handle: (
300
- String html,
301
- Options options_handle,
302
- ?inline_image_config image_config
303
- ) -> html_extraction
304
-
305
- # Convert HTML with inline image extraction
306
- #
307
- # Optionally accepts a visitor for customizing conversion behavior.
308
- #
309
- # Args:
310
- # html: HTML string to convert
311
- # options: Optional conversion configuration
312
- # image_config: Optional inline image extraction configuration
313
- # visitor: Optional visitor object for customizing conversion
314
- #
315
- # Returns:
316
- # html_extraction: Hash containing markdown, inline_images array, and warnings array
317
- #
318
- # Example:
319
- # result = HtmlToMarkdown.convert_with_inline_images(html, { wrap: true }, image_config, my_visitor)
320
- def self.convert_with_inline_images: (
321
- String html,
322
- ?conversion_options options,
323
- ?inline_image_config image_config,
324
- ?visitor visitor
325
- ) -> html_extraction
326
-
327
- # Convert HTML to Markdown with a custom visitor (deprecated)
328
- #
329
- # DEPRECATED: Use convert() with the optional visitor parameter instead.
330
- # This method is maintained for backward compatibility.
331
- #
332
- # All convert functions now accept optional visitors:
333
- # - convert(html, options, visitor)
334
- # - convert_with_inline_images(html, options, image_config, visitor)
335
- # - convert_with_metadata(html, options, metadata_config, visitor)
336
- #
337
- # The visitor object can implement any of the following methods:
338
- # - visit_element_start(ctx) -> visitor_result
339
- # - visit_element_end(ctx, output) -> visitor_result
340
- # - visit_text(ctx, text) -> visitor_result
341
- # - visit_link(ctx, href, text, title) -> visitor_result
342
- # - visit_image(ctx, src, alt, title) -> visitor_result
343
- # - visit_heading(ctx, level, text, id) -> visitor_result
344
- # - visit_code_block(ctx, lang, code) -> visitor_result
345
- # - visit_code_inline(ctx, code) -> visitor_result
346
- # - visit_list_item(ctx, ordered, marker, text) -> visitor_result
347
- # - visit_list_start(ctx, ordered) -> visitor_result
348
- # - visit_list_end(ctx, ordered, output) -> visitor_result
349
- # - visit_table_start(ctx) -> visitor_result
350
- # - visit_table_row(ctx, cells, is_header) -> visitor_result
351
- # - visit_table_end(ctx, output) -> visitor_result
352
- # - visit_blockquote(ctx, content, depth) -> visitor_result
353
- # - visit_strong(ctx, text) -> visitor_result
354
- # - visit_emphasis(ctx, text) -> visitor_result
355
- # - visit_strikethrough(ctx, text) -> visitor_result
356
- # - visit_underline(ctx, text) -> visitor_result
357
- # - visit_subscript(ctx, text) -> visitor_result
358
- # - visit_superscript(ctx, text) -> visitor_result
359
- # - visit_mark(ctx, text) -> visitor_result
360
- # - visit_line_break(ctx) -> visitor_result
361
- # - visit_horizontal_rule(ctx) -> visitor_result
362
- # - visit_custom_element(ctx, tag_name, html) -> visitor_result
363
- # - visit_definition_list_start(ctx) -> visitor_result
364
- # - visit_definition_term(ctx, text) -> visitor_result
365
- # - visit_definition_description(ctx, text) -> visitor_result
366
- # - visit_definition_list_end(ctx, output) -> visitor_result
367
- # - visit_form(ctx, action, method) -> visitor_result
368
- # - visit_input(ctx, input_type, name, value) -> visitor_result
369
- # - visit_button(ctx, text) -> visitor_result
370
- # - visit_audio(ctx, src) -> visitor_result
371
- # - visit_video(ctx, src) -> visitor_result
372
- # - visit_iframe(ctx, src) -> visitor_result
373
- # - visit_details(ctx, open) -> visitor_result
374
- # - visit_summary(ctx, text) -> visitor_result
375
- # - visit_figure_start(ctx) -> visitor_result
376
- # - visit_figcaption(ctx, text) -> visitor_result
377
- # - visit_figure_end(ctx, output) -> visitor_result
378
- #
379
- # Each method should return a Hash with at least :type key:
380
- # { type: :continue } - Continue with default behavior
381
- # { type: :custom, output: "..." } - Replace with custom markdown
382
- # { type: :skip } - Skip this element entirely
383
- # { type: :preserve_html } - Keep original HTML
384
- # { type: :error, message: "..." } - Stop conversion with error
385
- #
386
- # Args:
387
- # html: HTML string to convert
388
- # options: Optional conversion configuration
389
- # visitor: Visitor object that responds to visitor callback methods
390
- #
391
- # Returns:
392
- # markdown: String - Converted markdown output
393
- #
394
- # Example:
395
- # class MyVisitor
396
- # def visit_link(ctx, href, text, title = nil)
397
- # { type: :custom, output: "[#{text}](#{href})" }
398
- # end
399
- # end
400
- #
401
- # HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
402
- def self.convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
403
-
404
- # Convert HTML to Markdown with metadata extraction
405
- #
406
- # Extracts comprehensive metadata (headers, links, images, structured data) during conversion.
407
- # Optionally accepts a visitor for customizing conversion behavior.
408
- #
409
- # Args:
410
- # html: HTML string to convert
411
- # options: Optional conversion configuration
412
- # metadata_config: Optional metadata extraction configuration
413
- # visitor: Optional visitor object for customizing conversion
414
- #
415
- # Returns:
416
- # Array containing:
417
- # - [0] markdown: String - Converted markdown output
418
- # - [1] metadata: Hash - Extracted metadata with document, headers, links, images, structured_data
419
- #
420
- # The metadata hash contains:
421
- # - document: Document-level metadata (title, description, lang, etc.)
422
- # - headers: List of header elements with hierarchy
423
- # - links: List of extracted hyperlinks with classification
424
- # - images: List of extracted images with metadata
425
- # - structured_data: List of JSON-LD, Microdata, or RDFa blocks
426
- #
427
- # Example:
428
- # html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
429
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
430
- # puts "Title: #{metadata['document']['title']}"
431
- # puts "Headers: #{metadata['headers'].length}"
432
- #
433
- # Example with visitor:
434
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, metadata_config, my_visitor)
435
- def self.convert_with_metadata: (
436
- String html,
437
- ?conversion_options options,
438
- ?metadata_config metadata_config,
439
- ?visitor visitor
440
- ) -> [String, extended_metadata]
441
- def self.convert_with_metadata_handle: (
442
- String html,
443
- Options options_handle,
444
- ?metadata_config metadata_config
445
- ) -> [String, extended_metadata]
446
-
447
- # Convert HTML and extract tables as structured data
448
- #
449
- # Args:
450
- # html: HTML string to convert
451
- # options: Optional conversion configuration
452
- # metadata_config: Optional metadata extraction configuration
453
- #
454
- # Returns:
455
- # table_extraction_result: Hash containing content, metadata, and tables array
129
+ # Convert HTML to Markdown, returning a Hash with content, metadata, tables, images, and warnings.
456
130
  #
457
131
  # Example:
458
- # result = HtmlToMarkdown.convert_with_tables(html)
459
- # puts result[:tables].length
460
- def self.convert_with_tables: (
461
- String html,
462
- ?conversion_options options,
463
- ?metadata_config metadata_config
464
- ) -> table_extraction_result
132
+ # result = HtmlToMarkdown.convert(html)
133
+ def self.convert: (String html, ?conversion_options options) -> Hash[String, untyped]
465
134
 
466
- # Instance method versions (created by module_function)
467
- def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
468
- def options: (?conversion_options options_hash) -> Options
469
- def convert_with_options: (String html, Options options_handle) -> String
470
- def convert_with_inline_images_handle: (
471
- String html,
472
- Options options_handle,
473
- ?inline_image_config image_config
474
- ) -> html_extraction
475
- def convert_with_inline_images: (
476
- String html,
477
- ?conversion_options options,
478
- ?inline_image_config image_config,
479
- ?visitor visitor
480
- ) -> html_extraction
481
- def convert_with_visitor: (String html, ?conversion_options options, visitor: visitor) -> String
482
- def convert_with_metadata: (
483
- String html,
484
- ?conversion_options options,
485
- ?metadata_config metadata_config,
486
- ?visitor visitor
487
- ) -> [String, extended_metadata]
488
- def convert_with_metadata_handle: (
489
- String html,
490
- Options options_handle,
491
- ?metadata_config metadata_config
492
- ) -> [String, extended_metadata]
493
- def convert_with_tables: (
494
- String html,
495
- ?conversion_options options,
496
- ?metadata_config metadata_config
497
- ) -> table_extraction_result
135
+ # Instance method version (created by module_function)
136
+ def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
498
137
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "2.29.0"
6
+ version = "3.0.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -13,18 +13,21 @@ homepage = "https://kreuzberg.dev"
13
13
 
14
14
  [workspace.dependencies]
15
15
  ahash = { version = "0.8", features = ["std", "compile-time-rng"], default-features = false }
16
- async-trait = "0.1"
17
16
  base64 = "0.22"
18
17
  clap = { version = "4.6", features = ["derive"] }
19
18
  clap_complete = "4.6"
20
- clap_mangen = "0.2"
19
+ clap_mangen = "0.3"
21
20
  encoding_rs = "0.8"
22
- ext-php-rs = "0.15.6"
21
+ ext-php-rs = "0.15.7"
23
22
  html5ever = "0.39.0"
24
23
  once_cell = "1.21"
25
24
  pyo3 = { version = "0.28.2", features = ["abi3-py310"] }
25
+ rayon = "1.11"
26
26
  regex = "1.12"
27
27
  serde = { version = "1.0", features = ["derive"] }
28
28
  serde_json = "1.0"
29
+ tempfile = "3.27"
29
30
  thiserror = "2.0"
30
31
  tl = { package = "astral-tl", version = "0.7.11" }
32
+ toml = "1.1"
33
+ which = "8"