html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,605 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # rubocop:disable RSpec/VerifiedDoubles, RSpec/DescribeMethod
4
- require 'spec_helper'
5
-
6
- RSpec.describe HtmlToMarkdown, 'Issue #187: Visitor tag_name context validation' do
7
- describe 'tag_name in visit_element_start context' do
8
- it 'receives correct tag_name for div element' do
9
- html = '<div>Content</div>'
10
- visitor = double(Object)
11
- tag_names_visited = []
12
-
13
- allow(visitor).to receive(:visit_element_start) do |ctx|
14
- tag_names_visited << ctx[:tag_name]
15
- { type: :continue }
16
- end
17
-
18
- allow(visitor).to receive_messages(
19
- visit_element_end: { type: :continue },
20
- visit_text: { type: :continue }
21
- )
22
-
23
- described_class.convert_with_visitor(html, nil, visitor)
24
-
25
- expect(tag_names_visited).to include('div')
26
- end
27
-
28
- it 'receives correct tag_name for script element' do
29
- html = '<div><script>var x = 1;</script></div>'
30
- visitor = double(Object)
31
- tag_names_visited = []
32
-
33
- allow(visitor).to receive(:visit_element_start) do |ctx|
34
- tag_names_visited << ctx[:tag_name]
35
- { type: :continue }
36
- end
37
-
38
- # NOTE: script and style elements are filtered out by default, so this test verifies
39
- # that when they do appear in the context, tag_name is correct
40
- allow(visitor).to receive_messages(
41
- visit_element_end: { type: :continue },
42
- visit_text: { type: :continue }
43
- )
44
-
45
- described_class.convert_with_visitor(html, nil, visitor)
46
-
47
- # script tags are often filtered out for security; verify div is there at minimum
48
- expect(tag_names_visited).to include('div')
49
- end
50
-
51
- it 'receives correct tag_name for style element' do
52
- html = '<style>.cls { color: red; }</style><p>Text</p>'
53
- visitor = double(Object)
54
- tag_names_visited = []
55
-
56
- allow(visitor).to receive(:visit_element_start) do |ctx|
57
- tag_names_visited << ctx[:tag_name]
58
- { type: :continue }
59
- end
60
-
61
- allow(visitor).to receive_messages(
62
- visit_element_end: { type: :continue },
63
- visit_text: { type: :continue }
64
- )
65
-
66
- described_class.convert_with_visitor(html, nil, visitor)
67
-
68
- # style tags are often filtered out for security; verify p is there at minimum
69
- expect(tag_names_visited).to include('p')
70
- end
71
-
72
- it 'receives correct tag_name for p element' do
73
- html = '<div><p>Paragraph</p></div>'
74
- visitor = double(Object)
75
- tag_names_visited = []
76
-
77
- allow(visitor).to receive(:visit_element_start) do |ctx|
78
- tag_names_visited << ctx[:tag_name]
79
- { type: :continue }
80
- end
81
-
82
- allow(visitor).to receive_messages(
83
- visit_element_end: { type: :continue },
84
- visit_text: { type: :continue }
85
- )
86
-
87
- described_class.convert_with_visitor(html, nil, visitor)
88
-
89
- expect(tag_names_visited).to include('p')
90
- end
91
-
92
- it 'receives all expected tag names for mixed HTML' do
93
- html = '<div><p>Text</p><h1>Heading</h1><span>Span</span></div>'
94
- visitor = double(Object)
95
- tag_names_visited = []
96
-
97
- allow(visitor).to receive(:visit_element_start) do |ctx|
98
- tag_names_visited << ctx[:tag_name]
99
- { type: :continue }
100
- end
101
-
102
- allow(visitor).to receive_messages(
103
- visit_element_end: { type: :continue },
104
- visit_text: { type: :continue },
105
- visit_heading: { type: :continue }
106
- )
107
-
108
- described_class.convert_with_visitor(html, nil, visitor)
109
-
110
- expect(tag_names_visited).to include('div')
111
- expect(tag_names_visited).to include('p')
112
- expect(tag_names_visited).to include('h1')
113
- expect(tag_names_visited).to include('span')
114
- end
115
- end
116
-
117
- describe 'tag_name type validation' do
118
- it 'tag_name is always a string' do
119
- html = '<section id="main"><article>Content</article></section>'
120
- visitor = double(Object)
121
- tag_name_types = []
122
-
123
- allow(visitor).to receive(:visit_element_start) do |ctx|
124
- tag_name_types << ctx[:tag_name].class
125
- { type: :continue }
126
- end
127
-
128
- allow(visitor).to receive_messages(
129
- visit_element_end: { type: :continue },
130
- visit_text: { type: :continue }
131
- )
132
-
133
- described_class.convert_with_visitor(html, nil, visitor)
134
-
135
- expect(tag_name_types).not_to be_empty
136
- expect(tag_name_types).to all(eq(String))
137
- end
138
-
139
- it 'tag_name is never nil' do
140
- html = '<div><span>Text</span></div>'
141
- visitor = double(Object)
142
- nil_tag_names = []
143
-
144
- allow(visitor).to receive(:visit_element_start) do |ctx|
145
- nil_tag_names << ctx[:tag_name] if ctx[:tag_name].nil?
146
- { type: :continue }
147
- end
148
-
149
- allow(visitor).to receive_messages(
150
- visit_element_end: { type: :continue },
151
- visit_text: { type: :continue }
152
- )
153
-
154
- described_class.convert_with_visitor(html, nil, visitor)
155
-
156
- expect(nil_tag_names).to be_empty
157
- end
158
-
159
- it 'tag_name is never empty string' do
160
- html = '<div><p>Test</p></div>'
161
- visitor = double(Object)
162
- empty_tag_names = []
163
-
164
- allow(visitor).to receive(:visit_element_start) do |ctx|
165
- empty_tag_names << ctx[:tag_name] if ctx[:tag_name].empty?
166
- { type: :continue }
167
- end
168
-
169
- allow(visitor).to receive_messages(
170
- visit_element_end: { type: :continue },
171
- visit_text: { type: :continue }
172
- )
173
-
174
- described_class.convert_with_visitor(html, nil, visitor)
175
-
176
- expect(empty_tag_names).to be_empty
177
- end
178
- end
179
-
180
- describe 'filtering by tag name' do
181
- it 'filters divs by tag name in context' do
182
- html = '<div id="d1"><div id="d2">Nested</div></div><p>Paragraph</p>'
183
- visitor = double(Object)
184
- divs_found = []
185
-
186
- allow(visitor).to receive(:visit_element_start) do |ctx|
187
- divs_found << ctx[:attributes]['id'] if ctx[:tag_name] == 'div'
188
- { type: :continue }
189
- end
190
-
191
- allow(visitor).to receive_messages(
192
- visit_element_end: { type: :continue },
193
- visit_text: { type: :continue }
194
- )
195
-
196
- described_class.convert_with_visitor(html, nil, visitor)
197
-
198
- expect(divs_found).to include('d1')
199
- expect(divs_found).to include('d2')
200
- expect(divs_found.length).to eq(2)
201
- end
202
-
203
- it 'filters elements by tag name and applies custom transformation' do
204
- html = '<div class="remove">Skip me</div><div class="keep">Keep me</div><p>Text</p>'
205
- visitor = double(Object)
206
-
207
- allow(visitor).to receive(:visit_element_start) do |ctx|
208
- if ctx[:tag_name] == 'div' && ctx[:attributes]['class'] == 'remove'
209
- { type: :skip }
210
- else
211
- { type: :continue }
212
- end
213
- end
214
-
215
- allow(visitor).to receive_messages(
216
- visit_element_end: { type: :continue },
217
- visit_text: { type: :continue }
218
- )
219
-
220
- result = described_class.convert_with_visitor(html, nil, visitor)
221
-
222
- expect(result).to include('Keep me')
223
- expect(result).not_to include('Skip me')
224
- end
225
-
226
- it 'collects tag names by class attribute' do
227
- html = '<div class="container"><p class="content">Text</p></div>'
228
- visitor = double(Object)
229
- tags_by_class = {}
230
-
231
- allow(visitor).to receive(:visit_element_start) do |ctx|
232
- class_name = ctx[:attributes]['class']
233
- if class_name
234
- tags_by_class[class_name] ||= []
235
- tags_by_class[class_name] << ctx[:tag_name]
236
- end
237
- { type: :continue }
238
- end
239
-
240
- allow(visitor).to receive_messages(
241
- visit_element_end: { type: :continue },
242
- visit_text: { type: :continue }
243
- )
244
-
245
- described_class.convert_with_visitor(html, nil, visitor)
246
-
247
- expect(tags_by_class['container']).to include('div')
248
- expect(tags_by_class['content']).to include('p')
249
- end
250
-
251
- it 'filters and counts specific tag names' do
252
- html = '<h1>H1</h1><h2>H2</h2><p>Para</p><h3>H3</h3>'
253
- visitor = double(Object)
254
- heading_count = { h1: 0, h2: 0, h3: 0 }
255
-
256
- allow(visitor).to receive(:visit_element_start) do |ctx|
257
- case ctx[:tag_name]
258
- when 'h1'
259
- heading_count[:h1] += 1
260
- when 'h2'
261
- heading_count[:h2] += 1
262
- when 'h3'
263
- heading_count[:h3] += 1
264
- end
265
- { type: :continue }
266
- end
267
-
268
- allow(visitor).to receive_messages(
269
- visit_element_end: { type: :continue },
270
- visit_text: { type: :continue },
271
- visit_heading: { type: :continue }
272
- )
273
-
274
- described_class.convert_with_visitor(html, nil, visitor)
275
-
276
- expect(heading_count[:h1]).to eq(1)
277
- expect(heading_count[:h2]).to eq(1)
278
- expect(heading_count[:h3]).to eq(1)
279
- end
280
- end
281
-
282
- describe 'filtering divs by class attribute combined with tag_name' do
283
- it 'identifies and filters divs with specific classes' do
284
- html = '
285
- <div class="header">Header</div>
286
- <div class="content">Content</div>
287
- <div class="footer">Footer</div>
288
- <p class="text">Paragraph</p>
289
- '
290
- visitor = double(Object)
291
- content_divs = []
292
-
293
- allow(visitor).to receive(:visit_element_start) do |ctx|
294
- content_divs << ctx if ctx[:tag_name] == 'div' && ctx[:attributes]['class'] == 'content'
295
- { type: :continue }
296
- end
297
-
298
- allow(visitor).to receive_messages(
299
- visit_element_end: { type: :continue },
300
- visit_text: { type: :continue }
301
- )
302
-
303
- described_class.convert_with_visitor(html, nil, visitor)
304
-
305
- expect(content_divs.length).to eq(1)
306
- expect(content_divs[0][:tag_name]).to eq('div')
307
- expect(content_divs[0][:attributes]['class']).to eq('content')
308
- end
309
-
310
- it 'skips divs matching filter criteria' do
311
- html = '
312
- <div class="advertisement">Ad</div>
313
- <p>Paragraph 1</p>
314
- <div class="advertisement">Another ad</div>
315
- <p>Paragraph 2</p>
316
- '
317
- visitor = double(Object)
318
-
319
- allow(visitor).to receive(:visit_element_start) do |ctx|
320
- if ctx[:tag_name] == 'div' && ctx[:attributes]['class']&.include?('advertisement')
321
- { type: :skip }
322
- else
323
- { type: :continue }
324
- end
325
- end
326
-
327
- allow(visitor).to receive_messages(
328
- visit_element_end: { type: :continue },
329
- visit_text: { type: :continue }
330
- )
331
-
332
- result = described_class.convert_with_visitor(html, nil, visitor)
333
-
334
- expect(result).to include('Paragraph 1')
335
- expect(result).to include('Paragraph 2')
336
- expect(result).not_to include('Ad')
337
- expect(result).not_to include('Another ad')
338
- end
339
-
340
- it 'transforms divs with specific class' do
341
- html = '<div class="warning">Important</div><div>Normal</div>'
342
- visitor = double(Object)
343
-
344
- allow(visitor).to receive(:visit_element_start) do |ctx|
345
- if ctx[:tag_name] == 'div' && ctx[:attributes]['class'] == 'warning'
346
- { type: :custom, output: '**WARNING: Important**' }
347
- else
348
- { type: :continue }
349
- end
350
- end
351
-
352
- allow(visitor).to receive_messages(
353
- visit_element_end: { type: :continue },
354
- visit_text: { type: :continue }
355
- )
356
-
357
- result = described_class.convert_with_visitor(html, nil, visitor)
358
-
359
- expect(result).to include('WARNING: Important')
360
- end
361
-
362
- it 'preserves HTML for certain divs based on class' do
363
- html = '<div class="custom">Custom HTML</div><div>Normal</div>'
364
- visitor = double(Object)
365
-
366
- allow(visitor).to receive(:visit_element_start) do |ctx|
367
- if ctx[:tag_name] == 'div' && ctx[:attributes]['class'] == 'custom'
368
- { type: :preserve_html }
369
- else
370
- { type: :continue }
371
- end
372
- end
373
-
374
- allow(visitor).to receive_messages(
375
- visit_element_end: { type: :continue },
376
- visit_text: { type: :continue }
377
- )
378
-
379
- result = described_class.convert_with_visitor(html, nil, visitor)
380
-
381
- expect(result).to be_a(String)
382
- end
383
- end
384
-
385
- describe 'tag_name consistency across visitor lifecycle' do
386
- it 'tag_name is consistent in visit_element_start and visit_element_end' do
387
- html = '<section><article>Content</article></section>'
388
- visitor = double(Object)
389
- element_lifecycle = {}
390
-
391
- allow(visitor).to receive(:visit_element_start) do |ctx|
392
- element_lifecycle[ctx[:tag_name]] ||= { start: 0, end: 0 }
393
- element_lifecycle[ctx[:tag_name]][:start] += 1
394
- { type: :continue }
395
- end
396
-
397
- allow(visitor).to receive(:visit_element_end) do |ctx, _output|
398
- element_lifecycle[ctx[:tag_name]][:end] += 1
399
- { type: :continue }
400
- end
401
-
402
- allow(visitor).to receive(:visit_text).and_return({ type: :continue })
403
-
404
- described_class.convert_with_visitor(html, nil, visitor)
405
-
406
- expect(element_lifecycle['section'][:start]).to eq(element_lifecycle['section'][:end])
407
- expect(element_lifecycle['article'][:start]).to eq(element_lifecycle['article'][:end])
408
- end
409
-
410
- it 'tag_name is consistent for nested elements of same type' do
411
- html = '<div><div><div>Deep</div></div></div>'
412
- visitor = double(Object)
413
- div_count = 0
414
-
415
- allow(visitor).to receive(:visit_element_start) do |ctx|
416
- div_count += 1 if ctx[:tag_name] == 'div'
417
- { type: :continue }
418
- end
419
-
420
- allow(visitor).to receive_messages(
421
- visit_element_end: { type: :continue },
422
- visit_text: { type: :continue }
423
- )
424
-
425
- described_class.convert_with_visitor(html, nil, visitor)
426
-
427
- expect(div_count).to eq(3)
428
- end
429
- end
430
-
431
- describe 'tag_name with complex HTML structures' do
432
- it 'maintains tag_name accuracy with complex nested structure' do
433
- html = '
434
- <article>
435
- <header><h1>Title</h1></header>
436
- <section>
437
- <div class="content">
438
- <p>Paragraph <strong>bold</strong></p>
439
- <ul>
440
- <li>Item 1</li>
441
- <li>Item 2</li>
442
- </ul>
443
- </div>
444
- </section>
445
- <footer>Footer</footer>
446
- </article>
447
- '
448
- visitor = double(Object)
449
- tag_names = []
450
-
451
- allow(visitor).to receive(:visit_element_start) do |ctx|
452
- tag_names << ctx[:tag_name]
453
- { type: :continue }
454
- end
455
-
456
- allow(visitor).to receive_messages(
457
- visit_element_end: { type: :continue },
458
- visit_text: { type: :continue },
459
- visit_heading: { type: :continue },
460
- visit_list_start: { type: :continue },
461
- visit_list_end: { type: :continue },
462
- visit_list_item: { type: :continue },
463
- visit_strong: { type: :continue }
464
- )
465
-
466
- described_class.convert_with_visitor(html, nil, visitor)
467
-
468
- expect(tag_names).to include('article')
469
- expect(tag_names).to include('header')
470
- expect(tag_names).to include('h1')
471
- expect(tag_names).to include('section')
472
- expect(tag_names).to include('div')
473
- expect(tag_names).to include('p')
474
- expect(tag_names).to include('strong')
475
- expect(tag_names).to include('ul')
476
- expect(tag_names).to include('li')
477
- expect(tag_names).to include('footer')
478
- end
479
-
480
- it 'correctly filters table elements by tag_name' do
481
- html = '
482
- <table>
483
- <tr><th>Header</th></tr>
484
- <tr><td>Data</td></tr>
485
- </table>
486
- '
487
- visitor = double(Object)
488
- tag_names = []
489
-
490
- allow(visitor).to receive(:visit_element_start) do |ctx|
491
- tag_names << ctx[:tag_name]
492
- { type: :continue }
493
- end
494
-
495
- allow(visitor).to receive_messages(
496
- visit_element_end: { type: :continue },
497
- visit_text: { type: :continue },
498
- visit_table_start: { type: :continue },
499
- visit_table_end: { type: :continue },
500
- visit_table_row: { type: :continue }
501
- )
502
-
503
- described_class.convert_with_visitor(html, nil, visitor)
504
-
505
- # Tables are handled at higher level, verify core table tag is there
506
- expect(tag_names).to include('table')
507
- end
508
- end
509
-
510
- describe 'tag_name edge cases' do
511
- it 'handles self-closing tags correctly' do
512
- html = '<p>Text<br/>More text<hr/></p>'
513
- visitor = double(Object)
514
- tag_names = []
515
-
516
- allow(visitor).to receive(:visit_element_start) do |ctx|
517
- tag_names << ctx[:tag_name]
518
- { type: :continue }
519
- end
520
-
521
- allow(visitor).to receive_messages(
522
- visit_element_end: { type: :continue },
523
- visit_text: { type: :continue }
524
- )
525
-
526
- described_class.convert_with_visitor(html, nil, visitor)
527
-
528
- expect(tag_names).to include('p')
529
- expect(tag_names).to include('br')
530
- expect(tag_names).to include('hr')
531
- end
532
-
533
- it 'handles lowercase tag names' do
534
- html = '<DIV class="Test">Content</DIV>'
535
- visitor = double(Object)
536
- tag_names = []
537
-
538
- allow(visitor).to receive(:visit_element_start) do |ctx|
539
- tag_names << ctx[:tag_name]
540
- { type: :continue }
541
- end
542
-
543
- allow(visitor).to receive_messages(
544
- visit_element_end: { type: :continue },
545
- visit_text: { type: :continue }
546
- )
547
-
548
- described_class.convert_with_visitor(html, nil, visitor)
549
-
550
- # HTML5 normalizes tags to lowercase
551
- expect(tag_names).to include('div')
552
- end
553
-
554
- it 'handles special/semantic elements' do
555
- html = '<main><nav>Navigation</nav><aside>Sidebar</aside></main>'
556
- visitor = double(Object)
557
- tag_names = []
558
-
559
- allow(visitor).to receive(:visit_element_start) do |ctx|
560
- tag_names << ctx[:tag_name]
561
- { type: :continue }
562
- end
563
-
564
- allow(visitor).to receive_messages(
565
- visit_element_end: { type: :continue },
566
- visit_text: { type: :continue }
567
- )
568
-
569
- described_class.convert_with_visitor(html, nil, visitor)
570
-
571
- expect(tag_names).to include('main')
572
- expect(tag_names).to include('nav')
573
- expect(tag_names).to include('aside')
574
- end
575
-
576
- it 'handles form elements correctly' do
577
- html = '
578
- <form>
579
- <input type="text" name="username"/>
580
- <button>Submit</button>
581
- </form>
582
- '
583
- visitor = double(Object)
584
- tag_names = []
585
-
586
- allow(visitor).to receive(:visit_element_start) do |ctx|
587
- tag_names << ctx[:tag_name]
588
- { type: :continue }
589
- end
590
-
591
- allow(visitor).to receive_messages(
592
- visit_element_end: { type: :continue },
593
- visit_text: { type: :continue }
594
- )
595
-
596
- described_class.convert_with_visitor(html, nil, visitor)
597
-
598
- expect(tag_names).to include('form')
599
- expect(tag_names).to include('input')
600
- expect(tag_names).to include('button')
601
- end
602
- end
603
- end
604
-
605
- # rubocop:enable RSpec/VerifiedDoubles, RSpec/DescribeMethod