html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
--- data/spec/metadata_extraction_spec.rb
+++ /dev/null
@@ -1,437 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spec_helper'
4
-
5
- RSpec.describe HtmlToMarkdown do
6
- describe '.convert_with_metadata' do
7
- it 'returns array with markdown and metadata' do
8
- html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
9
- result = described_class.convert_with_metadata(html)
10
-
11
- expect(result).to be_an(Array)
12
- expect(result.length).to eq(2)
13
- expect(result[0]).to be_a(String)
14
- expect(result[1]).to be_a(Hash)
15
- end
16
-
17
- context 'when extracting document metadata' do
18
- it 'extracts title' do
19
- html = '<html><head><title>My Page Title</title></head><body><p>Content</p></body></html>'
20
- _, metadata = described_class.convert_with_metadata(html)
21
-
22
- expect(metadata[:document][:title]).to eq('My Page Title')
23
- end
24
-
25
- it 'extracts description' do
26
- html = <<~HTML
27
- <html>
28
- <head><meta name="description" content="Page description"></head>
29
- <body><p>Content</p></body>
30
- </html>
31
- HTML
32
- _, metadata = described_class.convert_with_metadata(html)
33
-
34
- expect(metadata[:document][:description]).to eq('Page description')
35
- end
36
-
37
- it 'extracts keywords' do
38
- html = <<~HTML
39
- <html>
40
- <head><meta name="keywords" content="keyword1, keyword2, keyword3"></head>
41
- <body><p>Content</p></body>
42
- </html>
43
- HTML
44
- _, metadata = described_class.convert_with_metadata(html)
45
-
46
- expect(metadata[:document][:keywords]).to include('keyword1', 'keyword2', 'keyword3')
47
- end
48
-
49
- it 'extracts author' do
50
- html = '<html><head><meta name="author" content="John Doe"></head><body><p>Content</p></body></html>'
51
- _, metadata = described_class.convert_with_metadata(html)
52
-
53
- expect(metadata[:document][:author]).to eq('John Doe')
54
- end
55
-
56
- it 'extracts base href' do
57
- html = '<html><head><base href="https://example.com/"></head><body><p>Content</p></body></html>'
58
- _, metadata = described_class.convert_with_metadata(html)
59
-
60
- expect(metadata[:document][:base_href]).to eq('https://example.com/')
61
- end
62
-
63
- it 'extracts canonical URL' do
64
- html = '<html><head><link rel="canonical" href="https://example.com/page"></head><body><p>Content</p></body></html>'
65
- _, metadata = described_class.convert_with_metadata(html)
66
-
67
- expect(metadata[:document][:canonical_url]).to eq('https://example.com/page')
68
- end
69
-
70
- it 'extracts language' do
71
- html = '<html lang="en"><head></head><body><p>Content</p></body></html>'
72
- _, metadata = described_class.convert_with_metadata(html)
73
-
74
- expect(metadata[:document][:language]).to eq('en')
75
- end
76
-
77
- it 'extracts text direction' do
78
- html = '<html dir="ltr"><head></head><body><p>Content</p></body></html>'
79
- _, metadata = described_class.convert_with_metadata(html)
80
-
81
- expect(metadata[:document][:text_direction]).to eq('ltr')
82
- end
83
-
84
- it 'extracts open graph metadata' do
85
- html = <<~HTML
86
- <html>
87
- <head>
88
- <meta property="og:title" content="OG Title">
89
- <meta property="og:description" content="OG Description">
90
- <meta property="og:image" content="https://example.com/image.jpg">
91
- </head>
92
- <body><p>Content</p></body>
93
- </html>
94
- HTML
95
- _, metadata = described_class.convert_with_metadata(html)
96
-
97
- expect(metadata[:document][:open_graph]).to include(
98
- 'title' => 'OG Title',
99
- 'description' => 'OG Description',
100
- 'image' => 'https://example.com/image.jpg'
101
- )
102
- end
103
-
104
- it 'extracts twitter card metadata' do
105
- html = <<~HTML
106
- <html>
107
- <head>
108
- <meta name="twitter:card" content="summary_large_image">
109
- <meta name="twitter:title" content="Twitter Title">
110
- </head>
111
- <body><p>Content</p></body>
112
- </html>
113
- HTML
114
- _, metadata = described_class.convert_with_metadata(html)
115
-
116
- expect(metadata[:document][:twitter_card]).to include(
117
- 'card' => 'summary_large_image',
118
- 'title' => 'Twitter Title'
119
- )
120
- end
121
-
122
- it 'returns empty arrays and hashes for missing metadata' do
123
- html = '<p>Content</p>'
124
- _, metadata = described_class.convert_with_metadata(html)
125
-
126
- expect(metadata[:document][:title]).to be_nil
127
- expect(metadata[:document][:description]).to be_nil
128
- expect(metadata[:document][:keywords]).to eq([])
129
- expect(metadata[:document][:open_graph]).to eq({})
130
- expect(metadata[:document][:twitter_card]).to eq({})
131
- expect(metadata[:document][:meta_tags]).to eq({})
132
- end
133
- end
134
-
135
- context 'when extracting header metadata' do
136
- it 'extracts headers with hierarchy' do
137
- html = <<~HTML
138
- <html>
139
- <body>
140
- <h1>Main Title</h1>
141
- <h2>Section</h2>
142
- <h3>Subsection</h3>
143
- </body>
144
- </html>
145
- HTML
146
- _, metadata = described_class.convert_with_metadata(html)
147
-
148
- expect(metadata[:headers].length).to eq(3)
149
- expect(metadata[:headers][0][:level]).to eq(1)
150
- expect(metadata[:headers][0][:text]).to eq('Main Title')
151
- expect(metadata[:headers][1][:level]).to eq(2)
152
- expect(metadata[:headers][1][:text]).to eq('Section')
153
- expect(metadata[:headers][2][:level]).to eq(3)
154
- expect(metadata[:headers][2][:text]).to eq('Subsection')
155
- end
156
-
157
- it 'includes header id' do
158
- html = '<html><body><h1 id="main-title">Title</h1></body></html>'
159
- _, metadata = described_class.convert_with_metadata(html)
160
-
161
- expect(metadata[:headers][0][:id]).to eq('main-title')
162
- end
163
-
164
- it 'includes depth and html_offset' do
165
- html = '<html><body><h1>Title</h1></body></html>'
166
- _, metadata = described_class.convert_with_metadata(html)
167
-
168
- header = metadata[:headers][0]
169
- expect(header).to include(:depth, :html_offset)
170
- expect(header[:depth]).to be_a(Integer)
171
- expect(header[:html_offset]).to be_a(Integer)
172
- end
173
- end
174
-
175
- context 'when extracting link metadata' do
176
- it 'extracts links with classification' do
177
- html = <<~HTML
178
- <html>
179
- <body>
180
- <a href="#section">Anchor</a>
181
- <a href="https://example.com">External</a>
182
- <a href="/page">Internal</a>
183
- <a href="mailto:test@example.com">Email</a>
184
- <a href="tel:+1234567890">Phone</a>
185
- </body>
186
- </html>
187
- HTML
188
- _, metadata = described_class.convert_with_metadata(html)
189
-
190
- links = metadata[:links]
191
- expect(links.length).to eq(5)
192
-
193
- expect(links[0][:link_type]).to eq('anchor')
194
- expect(links[1][:link_type]).to eq('external')
195
- expect(links[2][:link_type]).to eq('internal')
196
- expect(links[3][:link_type]).to eq('email')
197
- expect(links[4][:link_type]).to eq('phone')
198
- end
199
-
200
- it 'includes link text and href' do
201
- html = '<html><body><a href="https://example.com">Click here</a></body></html>'
202
- _, metadata = described_class.convert_with_metadata(html)
203
-
204
- link = metadata[:links][0]
205
- expect(link[:href]).to eq('https://example.com')
206
- expect(link[:text]).to eq('Click here')
207
- end
208
-
209
- it 'includes link title attribute' do
210
- html = '<html><body><a href="https://example.com" title="Example Site">Link</a></body></html>'
211
- _, metadata = described_class.convert_with_metadata(html)
212
-
213
- link = metadata[:links][0]
214
- expect(link[:title]).to eq('Example Site')
215
- end
216
-
217
- it 'includes link rel attributes' do
218
- html = '<html><body><a href="https://example.com" rel="nofollow external">Link</a></body></html>'
219
- _, metadata = described_class.convert_with_metadata(html)
220
-
221
- link = metadata[:links][0]
222
- expect(link[:rel]).to include('nofollow', 'external')
223
- end
224
-
225
- it 'includes link attributes' do
226
- html = '<html><body><a href="https://example.com" data-custom="value">Link</a></body></html>'
227
- _, metadata = described_class.convert_with_metadata(html)
228
-
229
- link = metadata[:links][0]
230
- expect(link[:attributes]).to include('data-custom' => 'value')
231
- end
232
- end
233
-
234
- context 'when extracting image metadata' do
235
- it 'extracts images with source type' do
236
- html = <<~HTML
237
- <html>
238
- <body>
239
- <img src="https://example.com/image.jpg" alt="External">
240
- <img src="/images/local.jpg" alt="Relative">
241
- <img src="data:image/png;base64,..." alt="Data URI">
242
- </body>
243
- </html>
244
- HTML
245
- _, metadata = described_class.convert_with_metadata(html)
246
-
247
- images = metadata[:images]
248
- expect(images.length).to eq(3)
249
-
250
- expect(images[0][:image_type]).to eq('external')
251
- expect(images[1][:image_type]).to eq('relative')
252
- expect(images[2][:image_type]).to eq('data_uri')
253
- end
254
-
255
- it 'includes image alt and title' do
256
- html = '<html><body><img src="image.jpg" alt="Alt text" title="Image title"></body></html>'
257
- _, metadata = described_class.convert_with_metadata(html)
258
-
259
- image = metadata[:images][0]
260
- expect(image[:alt]).to eq('Alt text')
261
- expect(image[:title]).to eq('Image title')
262
- end
263
-
264
- it 'includes image dimensions' do
265
- html = '<html><body><img src="image.jpg" width="800" height="600"></body></html>'
266
- _, metadata = described_class.convert_with_metadata(html)
267
-
268
- image = metadata[:images][0]
269
- expect(image[:dimensions]).to be_an(Array)
270
- expect(image[:dimensions].length).to eq(2)
271
- end
272
-
273
- it 'handles missing image attributes' do
274
- html = '<html><body><img src="image.jpg"></body></html>'
275
- _, metadata = described_class.convert_with_metadata(html)
276
-
277
- image = metadata[:images][0]
278
- expect(image[:alt]).to be_nil
279
- expect(image[:title]).to be_nil
280
- end
281
- end
282
-
283
- context 'with metadata configuration flags' do
284
- it 'respects extract_headers flag' do
285
- html = '<html><body><h1>Title</h1><p>Content</p></body></html>'
286
- config = { extract_headers: false }
287
- _, metadata = described_class.convert_with_metadata(html, nil, config)
288
-
289
- expect(metadata[:headers]).to eq([])
290
- end
291
-
292
- it 'respects extract_links flag' do
293
- html = '<html><body><a href="https://example.com">Link</a></body></html>'
294
- config = { extract_links: false }
295
- _, metadata = described_class.convert_with_metadata(html, nil, config)
296
-
297
- expect(metadata[:links]).to eq([])
298
- end
299
-
300
- it 'respects extract_images flag' do
301
- html = '<html><body><img src="image.jpg" alt="test"></body></html>'
302
- config = { extract_images: false }
303
- _, metadata = described_class.convert_with_metadata(html, nil, config)
304
-
305
- expect(metadata[:images]).to eq([])
306
- end
307
-
308
- it 'respects extract_structured_data flag' do
309
- html = '<html><body><script type="application/ld+json">{"@type":"Article"}</script></body></html>'
310
- config = { extract_structured_data: false }
311
- _, metadata = described_class.convert_with_metadata(html, nil, config)
312
-
313
- expect(metadata[:structured_data]).to eq([])
314
- end
315
- end
316
-
317
- context 'with conversion options and metadata config' do
318
- it 'accepts both conversion options and metadata config' do
319
- html = '<html><head><title>Test</title></head><body><h1>Heading</h1></body></html>'
320
- conv_opts = { heading_style: :atx_closed }
321
- meta_opts = { extract_headers: true }
322
-
323
- markdown, metadata = described_class.convert_with_metadata(html, conv_opts, meta_opts)
324
-
325
- expect(markdown).to include('# Heading #')
326
- expect(metadata[:headers].length).to eq(1)
327
- end
328
-
329
- it 'works with nil options' do
330
- html = '<html><head><title>Test</title></head><body><p>Content</p></body></html>'
331
- result = described_class.convert_with_metadata(html, nil, nil)
332
-
333
- expect(result).to be_an(Array)
334
- expect(result.length).to eq(2)
335
- end
336
- end
337
-
338
- context 'when extracting structured data' do
339
- it 'extracts JSON-LD blocks' do
340
- html = <<~HTML
341
- <html>
342
- <head>
343
- <script type="application/ld+json">
344
- {"@context":"https://schema.org","@type":"Article","headline":"Test"}
345
- </script>
346
- </head>
347
- <body><p>Content</p></body>
348
- </html>
349
- HTML
350
- _, metadata = described_class.convert_with_metadata(html)
351
-
352
- expect(metadata[:structured_data]).to be_an(Array)
353
- end
354
- end
355
-
356
- context 'with edge cases' do
357
- it 'handles empty HTML' do
358
- html = ''
359
- markdown, metadata = described_class.convert_with_metadata(html)
360
-
361
- expect(markdown).to be_a(String)
362
- expect(metadata).to be_a(Hash)
363
- end
364
-
365
- it 'handles malformed HTML' do
366
- html = '<html><head><title>Unclosed'
367
- markdown, metadata = described_class.convert_with_metadata(html)
368
-
369
- expect(markdown).to be_a(String)
370
- expect(metadata).to be_a(Hash)
371
- end
372
-
373
- it 'handles special characters in metadata' do
374
- html = '<html><head><title>Title with "quotes" & <brackets></title></head><body><p>Content</p></body></html>'
375
- _, metadata = described_class.convert_with_metadata(html)
376
-
377
- expect(metadata[:document][:title]).to be_a(String)
378
- end
379
-
380
- it 'handles whitespace in metadata' do
381
- html = '<html><head><title> Title with spaces </title></head><body><p>Content</p></body></html>'
382
- _, metadata = described_class.convert_with_metadata(html)
383
-
384
- expect(metadata[:document][:title]).to match(/Title.*spaces/)
385
- end
386
-
387
- it 'handles multiple values for same metadata key' do
388
- html = <<~HTML
389
- <html>
390
- <head>
391
- <meta name="author" content="Author 1">
392
- <meta name="author" content="Author 2">
393
- </head>
394
- <body><p>Content</p></body>
395
- </html>
396
- HTML
397
- _, metadata = described_class.convert_with_metadata(html)
398
-
399
- expect(metadata[:document][:author]).to be_a(String)
400
- end
401
- end
402
-
403
- context 'when returning value structure' do
404
- it 'returns proper metadata hash structure' do
405
- html = <<~HTML
406
- <html>
407
- <head><title>Test</title><base href="https://example.com"></head>
408
- <body><h1>H1</h1><a href="link">Link</a><img src="img.jpg"></body>
409
- </html>
410
- HTML
411
- _, metadata = described_class.convert_with_metadata(html)
412
-
413
- expect(metadata).to include(
414
- :document,
415
- :headers,
416
- :links,
417
- :images,
418
- :structured_data
419
- )
420
-
421
- expect(metadata[:document]).to include(
422
- :title,
423
- :description,
424
- :keywords,
425
- :author,
426
- :canonical_url,
427
- :base_href,
428
- :language,
429
- :text_direction,
430
- :open_graph,
431
- :twitter_card,
432
- :meta_tags
433
- )
434
- end
435
- end
436
- end
437
- end