html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
data/spec/convert_spec.rb DELETED
@@ -1,77 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
-
5
- RSpec.describe HtmlToMarkdown do
6
- describe '.convert' do
7
- it 'converts simple headings' do
8
- expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
9
- end
10
-
11
- it 'accepts options hash' do
12
- result = described_class.convert(
13
- '<h1>Hello</h1>',
14
- heading_style: :atx_closed,
15
- default_title: true
16
- )
17
- expect(result).to include('Hello')
18
- end
19
- end
20
-
21
- describe '.convert_with_inline_images' do
22
- it 'returns inline images metadata' do
23
- html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
24
- extraction = described_class.convert_with_inline_images(html)
25
- expect(extraction).to include(:markdown, :inline_images, :warnings)
26
- expect(extraction[:inline_images].first[:description]).to eq('fake')
27
- end
28
- end
29
-
30
- describe '.options' do
31
- it 'returns a reusable options handle' do
32
- handle = described_class.options(heading_style: :atx_closed)
33
- expect(handle).to be_a(HtmlToMarkdown::Options)
34
- result = described_class.convert_with_options('<h1>Hello</h1>', handle)
35
- expect(result).to include('# Hello #')
36
- end
37
- end
38
-
39
- describe 'panic handling' do
40
- context 'when a Rust panic would occur' do
41
- it 'catches panics in convert method' do
42
- malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
43
-
44
- begin
45
- result = described_class.convert(malformed_html)
46
- expect(result).to be_a(String)
47
- rescue RuntimeError => e
48
- expect(e.message).to match(/html-to-markdown panic during conversion/)
49
- end
50
- end
51
-
52
- it 'catches panics in convert_with_options method' do
53
- malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
54
- handle = described_class.options(heading_style: :atx)
55
-
56
- begin
57
- result = described_class.convert_with_options(malformed_html, handle)
58
- expect(result).to be_a(String)
59
- rescue RuntimeError => e
60
- expect(e.message).to match(/html-to-markdown panic during conversion/)
61
- end
62
- end
63
-
64
- it 'catches panics in convert_with_inline_images method' do
65
- malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
66
-
67
- begin
68
- result = described_class.convert_with_inline_images(malformed_html)
69
- expect(result).to be_a(Hash)
70
- expect(result).to include(:markdown, :inline_images, :warnings)
71
- rescue RuntimeError => e
72
- expect(e.message).to match(/html-to-markdown panic during conversion/)
73
- end
74
- end
75
- end
76
- end
77
- end
data/spec/convert_with_tables_spec.rb DELETED
@@ -1,194 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
-
5
- RSpec.describe HtmlToMarkdown do
6
- describe '.convert_with_tables' do
7
- it 'returns a hash with content, metadata, and tables keys' do
8
- html = '<table><tr><td>Cell</td></tr></table>'
9
- result = described_class.convert_with_tables(html)
10
-
11
- expect(result).to be_a(Hash)
12
- expect(result).to include(:content, :metadata, :tables)
13
- end
14
-
15
- context 'with a basic table with header' do
16
- let(:html) do
17
- <<~HTML
18
- <table>
19
- <thead>
20
- <tr><th>Name</th><th>Age</th></tr>
21
- </thead>
22
- <tbody>
23
- <tr><td>Alice</td><td>30</td></tr>
24
- </tbody>
25
- </table>
26
- HTML
27
- end
28
-
29
- it 'extracts exactly one table' do
30
- result = described_class.convert_with_tables(html)
31
-
32
- expect(result[:tables].length).to eq(1)
33
- end
34
-
35
- it 'extracts cells as rows of columns' do
36
- result = described_class.convert_with_tables(html)
37
- table = result[:tables][0]
38
-
39
- expect(table[:cells]).to be_an(Array)
40
- expect(table[:cells].length).to eq(2)
41
- expect(table[:cells][0]).to eq(%w[Name Age])
42
- expect(table[:cells][1]).to eq(%w[Alice 30])
43
- end
44
-
45
- it 'provides markdown representation' do
46
- result = described_class.convert_with_tables(html)
47
- table = result[:tables][0]
48
-
49
- expect(table[:markdown]).to be_a(String)
50
- expect(table[:markdown]).to include('Name')
51
- expect(table[:markdown]).to include('Alice')
52
- end
53
-
54
- it 'marks header rows correctly' do
55
- result = described_class.convert_with_tables(html)
56
- table = result[:tables][0]
57
-
58
- expect(table[:is_header_row]).to be_an(Array)
59
- expect(table[:is_header_row].length).to eq(2)
60
- expect(table[:is_header_row][0]).to be true
61
- expect(table[:is_header_row][1]).to be false
62
- end
63
-
64
- it 'includes converted markdown content' do
65
- result = described_class.convert_with_tables(html)
66
-
67
- expect(result[:content]).to be_a(String)
68
- expect(result[:content]).not_to be_empty
69
- end
70
- end
71
-
72
- context 'with empty HTML' do
73
- it 'returns empty tables array' do
74
- result = described_class.convert_with_tables('')
75
-
76
- expect(result[:tables]).to eq([])
77
- expect(result[:content]).to be_a(String)
78
- end
79
- end
80
-
81
- context 'with HTML containing no tables' do
82
- it 'returns empty tables array' do
83
- html = '<p>No tables here</p>'
84
- result = described_class.convert_with_tables(html)
85
-
86
- expect(result[:tables]).to eq([])
87
- expect(result[:content]).to include('No tables here')
88
- end
89
- end
90
-
91
- context 'with multiple tables' do
92
- let(:html) do
93
- <<~HTML
94
- <table>
95
- <tr><th>A</th></tr>
96
- <tr><td>1</td></tr>
97
- </table>
98
- <p>Some text between tables</p>
99
- <table>
100
- <tr><th>B</th><th>C</th></tr>
101
- <tr><td>2</td><td>3</td></tr>
102
- <tr><td>4</td><td>5</td></tr>
103
- </table>
104
- HTML
105
- end
106
-
107
- it 'extracts all tables' do
108
- result = described_class.convert_with_tables(html)
109
-
110
- expect(result[:tables].length).to eq(2)
111
- end
112
-
113
- it 'preserves table order' do
114
- result = described_class.convert_with_tables(html)
115
-
116
- first_table = result[:tables][0]
117
- second_table = result[:tables][1]
118
-
119
- expect(first_table[:cells][0]).to eq(['A'])
120
- expect(second_table[:cells][0]).to eq(%w[B C])
121
- end
122
-
123
- it 'extracts correct row counts per table' do
124
- result = described_class.convert_with_tables(html)
125
-
126
- expect(result[:tables][0][:cells].length).to eq(2)
127
- expect(result[:tables][1][:cells].length).to eq(3)
128
- end
129
- end
130
-
131
- context 'with special characters in cells' do
132
- let(:html) do
133
- <<~HTML
134
- <table>
135
- <tr><th>Key</th><th>Value</th></tr>
136
- <tr><td>Brackets &lt;&gt;</td><td>Ampersand &amp;</td></tr>
137
- <tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
138
- <tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
139
- </table>
140
- HTML
141
- end
142
-
143
- it 'handles HTML entities in cells' do
144
- result = described_class.convert_with_tables(html)
145
- table = result[:tables][0]
146
-
147
- expect(table[:cells][1][0]).to include('<>')
148
- expect(table[:cells][1][1]).to include('&')
149
- end
150
-
151
- it 'handles quotes in cells' do
152
- result = described_class.convert_with_tables(html)
153
- table = result[:tables][0]
154
-
155
- expect(table[:cells][2][0]).to include('"double"')
156
- expect(table[:cells][2][1]).to include("'single'")
157
- end
158
-
159
- it 'handles unicode in cells' do
160
- result = described_class.convert_with_tables(html)
161
- table = result[:tables][0]
162
-
163
- expect(table[:cells][3][0]).to be_a(String)
164
- end
165
- end
166
-
167
- context 'with conversion options' do
168
- it 'accepts options hash' do
169
- html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
170
- result = described_class.convert_with_tables(html, { heading_style: :atx })
171
-
172
- expect(result).to be_a(Hash)
173
- expect(result[:tables].length).to eq(1)
174
- end
175
-
176
- it 'accepts nil options' do
177
- html = '<table><tr><td>Data</td></tr></table>'
178
- result = described_class.convert_with_tables(html, nil, nil)
179
-
180
- expect(result).to be_a(Hash)
181
- expect(result[:tables].length).to eq(1)
182
- end
183
- end
184
-
185
- context 'with metadata config' do
186
- it 'includes metadata when configured' do
187
- html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
188
- result = described_class.convert_with_tables(html, nil, { extract_headers: true })
189
-
190
- expect(result[:metadata]).to be_a(Hash).or(be_nil)
191
- end
192
- end
193
- end
194
- end