html-to-markdown 2.16.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1149 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable RSpec/ContextWording, RSpec/VerifiedDoubles
4
+ require 'spec_helper'
5
+
6
+ RSpec.describe HtmlToMarkdown do
7
+ describe '.convert_with_visitor' do
8
+ # ============================================================================
9
+ # ============================================================================
10
+
11
+ def setup_visitor_stubs(visitor, methods)
12
+ methods.each do |method_name, return_value|
13
+ allow(visitor).to receive(method_name).and_return(return_value)
14
+ end
15
+ end
16
+
17
+ def apply_visitor_overrides(visitor, overrides)
18
+ overrides.each do |method_name, behavior|
19
+ if behavior.is_a?(Proc)
20
+ allow(visitor).to receive(method_name, &behavior)
21
+ else
22
+ allow(visitor).to receive(method_name).and_return(behavior)
23
+ end
24
+ end
25
+ end
26
+
27
+ def default_visitor_methods
28
+ %w[
29
+ visit_element_start visit_element_end visit_text visit_link visit_image
30
+ visit_heading visit_code_block visit_code_inline visit_list_item visit_list_start
31
+ visit_list_end visit_table_start visit_table_row visit_table_end visit_blockquote
32
+ visit_strong visit_emphasis visit_strikethrough visit_underline visit_subscript
33
+ visit_superscript visit_mark visit_line_break visit_horizontal_rule visit_custom_element
34
+ visit_definition_list_start visit_definition_term visit_definition_description
35
+ visit_definition_list_end visit_form visit_input visit_button visit_audio visit_video
36
+ visit_iframe visit_details visit_summary visit_figure_start visit_figcaption
37
+ visit_figure_end
38
+ ].each_with_object({}) { |name, hash| hash[name.to_sym] = { type: :continue } }
39
+ end
40
+
41
+ def create_visitor(**overrides)
42
+ visitor = double(Object)
43
+ setup_visitor_stubs(visitor, default_visitor_methods)
44
+ apply_visitor_overrides(visitor, overrides)
45
+ visitor
46
+ end
47
+
48
+ context 'visit_text callback' do # specs for the visit_text callback, stubbed here as |ctx, text|
49
+ it 'is called for text nodes' do
50
+ html = '<p>Hello World</p>'
51
+ visitor = create_visitor
52
+
53
+ allow(visitor).to receive(:visit_text).and_return({ type: :continue }) # same value create_visitor already stubs
54
+
55
+ result = described_class.convert_with_visitor(html, nil, visitor)
56
+ expect(visitor).to have_received(:visit_text)
57
+ expect(result).to include('Hello World')
58
+ end
59
+
60
+ it 'receives text content and context' do
61
+ html = '<p>Test content</p>'
62
+ visited_texts = []
63
+ visitor = create_visitor
64
+
65
+ allow(visitor).to receive(:visit_text) do |_ctx, text| # record each text argument passed to the callback
66
+ visited_texts << text
67
+ { type: :continue }
68
+ end
69
+
70
+ described_class.convert_with_visitor(html, nil, visitor)
71
+ expect(visited_texts).to include('Test content')
72
+ end
73
+
74
+ it 'validates node context for text node' do
75
+ html = '<p>Hello</p>'
76
+ visitor = create_visitor
77
+ captured_ctx = nil
78
+
79
+ allow(visitor).to receive(:visit_text) do |ctx, _text| # keeps the ctx of the last visit_text call
80
+ captured_ctx = ctx
81
+ { type: :continue }
82
+ end
83
+
84
+ described_class.convert_with_visitor(html, nil, visitor)
85
+
86
+ expect(captured_ctx).not_to be_nil
87
+ expect(captured_ctx[:tag_name]).to be_a(String) # only the type is checked, not the value
88
+ expect(captured_ctx[:depth]).to be_an(Integer)
89
+ end
90
+ end
91
+
92
+ context 'visit_link callback' do # specs for visit_link, stubbed here as |ctx, href, text, title|
93
+ it 'is called for anchor links' do
94
+ html = '<a href="https://example.com">Click here</a>'
95
+ visitor = create_visitor
96
+
97
+ allow(visitor).to receive(:visit_link).and_return({ type: :continue }) # same value create_visitor already stubs
98
+
99
+ described_class.convert_with_visitor(html, nil, visitor)
100
+ expect(visitor).to have_received(:visit_link)
101
+ end
102
+
103
+ it 'receives href, text, and optional title' do
104
+ html = '<a href="https://example.com" title="Example">Click</a>'
105
+ link_data = nil
106
+ visitor = create_visitor
107
+
108
+ allow(visitor).to receive(:visit_link) do |ctx, href, text, title| # capture all four callback arguments
109
+ link_data = { ctx: ctx, href: href, text: text, title: title }
110
+ { type: :continue }
111
+ end
112
+
113
+ described_class.convert_with_visitor(html, nil, visitor)
114
+
115
+ expect(link_data).not_to be_nil
116
+ expect(link_data[:href]).to eq('https://example.com')
117
+ expect(link_data[:text]).to eq('Click')
118
+ expect(link_data[:title]).to eq('Example')
119
+ end
120
+
121
+ it 'handles links without title attribute' do
122
+ html = '<a href="/path">Link</a>'
123
+ link_data = nil
124
+ visitor = create_visitor
125
+
126
+ allow(visitor).to receive(:visit_link) do |_ctx, href, text, title|
127
+ link_data = { href: href, text: text, title: title }
128
+ { type: :continue }
129
+ end
130
+
131
+ described_class.convert_with_visitor(html, nil, visitor)
132
+
133
+ expect(link_data[:title]).to be_nil # a missing title attribute is expected to arrive as nil
134
+ end
135
+
136
+ it 'validates node context contains link metadata' do
137
+ html = '<a href="https://example.com">Link</a>'
138
+ visitor = create_visitor
139
+ captured_ctx = nil
140
+
141
+ allow(visitor).to receive(:visit_link) do |ctx, _href, _text, _title|
142
+ captured_ctx = ctx
143
+ { type: :continue }
144
+ end
145
+
146
+ described_class.convert_with_visitor(html, nil, visitor)
147
+
148
+ expect(captured_ctx[:attributes]).to be_a(Hash)
149
+ expect(captured_ctx[:attributes]['href']).to eq('https://example.com') # attributes are looked up by String key
150
+ end
151
+ end
152
+
153
+ context 'visit_image callback' do # specs for visit_image, stubbed here as |ctx, src, alt, title|
154
+ it 'is called for image elements' do
155
+ html = '<img src="image.jpg" alt="An image">'
156
+ visitor = create_visitor
157
+
158
+ allow(visitor).to receive(:visit_image).and_return({ type: :continue }) # same value create_visitor already stubs
159
+
160
+ described_class.convert_with_visitor(html, nil, visitor)
161
+ expect(visitor).to have_received(:visit_image)
162
+ end
163
+
164
+ it 'receives src, alt, and optional title' do
165
+ html = '<img src="photo.jpg" alt="Beautiful" title="Photo">'
166
+ image_data = nil
167
+ visitor = create_visitor
168
+
169
+ allow(visitor).to receive(:visit_image) do |ctx, src, alt, title| # capture all four callback arguments
170
+ image_data = { ctx: ctx, src: src, alt: alt, title: title }
171
+ { type: :continue }
172
+ end
173
+
174
+ described_class.convert_with_visitor(html, nil, visitor)
175
+
176
+ expect(image_data[:src]).to eq('photo.jpg')
177
+ expect(image_data[:alt]).to eq('Beautiful')
178
+ expect(image_data[:title]).to eq('Photo')
179
+ end
180
+
181
+ it 'handles images without title attribute' do
182
+ html = '<img src="pic.png" alt="Picture">'
183
+ image_data = nil
184
+ visitor = create_visitor
185
+
186
+ allow(visitor).to receive(:visit_image) do |_ctx, src, alt, title|
187
+ image_data = { src: src, alt: alt, title: title }
188
+ { type: :continue }
189
+ end
190
+
191
+ described_class.convert_with_visitor(html, nil, visitor)
192
+
193
+ expect(image_data[:title]).to be_nil # a missing title attribute is expected to arrive as nil
194
+ end
195
+ end
196
+
197
+ context 'visit_heading callback' do # specs for visit_heading, stubbed here as |ctx, level, text, id|
198
+ it 'is called for heading elements' do
199
+ html = '<h1>Title</h1>'
200
+ visitor = create_visitor
201
+
202
+ allow(visitor).to receive(:visit_heading).and_return({ type: :continue }) # same value create_visitor already stubs
203
+
204
+ described_class.convert_with_visitor(html, nil, visitor)
205
+ expect(visitor).to have_received(:visit_heading)
206
+ end
207
+
208
+ it 'receives heading level, text, and optional id' do
209
+ html = '<h2 id="section">Chapter</h2>'
210
+ heading_data = nil
211
+ visitor = create_visitor
212
+
213
+ allow(visitor).to receive(:visit_heading) do |ctx, level, text, id| # capture all four callback arguments
214
+ heading_data = { ctx: ctx, level: level, text: text, id: id }
215
+ { type: :continue }
216
+ end
217
+
218
+ described_class.convert_with_visitor(html, nil, visitor)
219
+
220
+ expect(heading_data[:level]).to eq(2) # level is compared as an Integer
221
+ expect(heading_data[:text]).to eq('Chapter')
222
+ expect(heading_data[:id]).to eq('section')
223
+ end
224
+
225
+ it 'handles headings without id attribute' do
226
+ html = '<h3>Subsection</h3>'
227
+ heading_data = nil
228
+ visitor = create_visitor
229
+
230
+ allow(visitor).to receive(:visit_heading) do |_ctx, level, text, id|
231
+ heading_data = { level: level, text: text, id: id }
232
+ { type: :continue }
233
+ end
234
+
235
+ described_class.convert_with_visitor(html, nil, visitor)
236
+
237
+ expect(heading_data[:id]).to be_nil # a missing id attribute is expected to arrive as nil
238
+ end
239
+
240
+ it 'supports all heading levels (h1-h6)' do
241
+ heading_levels = []
242
+ visitor = create_visitor
243
+
244
+ allow(visitor).to receive(:visit_heading) do |_ctx, level, _text, _id|
245
+ heading_levels << level
246
+ { type: :continue }
247
+ end
248
+
249
+ html = '<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>'
250
+ described_class.convert_with_visitor(html, nil, visitor)
251
+
252
+ expect(heading_levels).to contain_exactly(1, 2, 3, 4, 5, 6) # order-insensitive; exactly one call per level
253
+ end
254
+ end
255
+
256
+ context 'visit_element_start callback' do # specs for visit_element_start, stubbed here as |ctx|
257
+ it 'is called when entering an element' do
258
+ html = '<div><p>Content</p></div>'
259
+ visitor = create_visitor
260
+
261
+ allow(visitor).to receive(:visit_element_start).and_return({ type: :continue }) # same value create_visitor already stubs
262
+
263
+ described_class.convert_with_visitor(html, nil, visitor)
264
+ expect(visitor).to have_received(:visit_element_start).at_least(:once)
265
+ end
266
+
267
+ it 'receives node context with tag information' do
268
+ html = '<section id="main" class="container">Text</section>'
269
+ contexts = []
270
+ visitor = create_visitor
271
+
272
+ allow(visitor).to receive(:visit_element_start) do |ctx| # collect every ctx so the <section> one can be found
273
+ contexts << ctx
274
+ { type: :continue }
275
+ end
276
+
277
+ described_class.convert_with_visitor(html, nil, visitor)
278
+
279
+ section_ctx = contexts.find { |ctx| ctx[:tag_name] == 'section' }
280
+ expect(section_ctx).not_to be_nil
281
+ expect(section_ctx[:attributes]['id']).to eq('main') # the class attribute is not asserted
282
+ end
283
+ end
284
+
285
+ context 'visit_element_end callback' do # specs for visit_element_end, stubbed here as |ctx, output|
286
+ it 'is called when exiting an element' do
287
+ html = '<div>Content</div>'
288
+ visitor = create_visitor
289
+
290
+ allow(visitor).to receive(:visit_element_end).and_return({ type: :continue }) # same value create_visitor already stubs
291
+
292
+ described_class.convert_with_visitor(html, nil, visitor)
293
+ expect(visitor).to have_received(:visit_element_end)
294
+ end
295
+
296
+ it 'receives context and generated output' do
297
+ html = '<p>Text content</p>'
298
+ element_end_data = nil
299
+ visitor = create_visitor
300
+
301
+ allow(visitor).to receive(:visit_element_end) do |ctx, output| # keeps the arguments of the last call only
302
+ element_end_data = { ctx: ctx, output: output }
303
+ { type: :continue }
304
+ end
305
+
306
+ described_class.convert_with_visitor(html, nil, visitor)
307
+
308
+ expect(element_end_data).not_to be_nil
309
+ expect(element_end_data[:output]).to be_a(String) # only the type is checked, not the content
310
+ end
311
+ end
312
+
313
+ context 'VisitResult::Continue' do # callbacks returning { type: :continue }
314
+ it 'continues with default behavior' do
315
+ html = '<p>Hello</p>'
316
+ visitor = create_visitor
317
+
318
+ allow(visitor).to receive(:visit_text).and_return({ type: :continue }) # same value create_visitor already stubs
319
+
320
+ result = described_class.convert_with_visitor(html, nil, visitor)
321
+ expect(result).to include('Hello')
322
+ end
323
+
324
+ it 'allows chaining of multiple visitors' do # NOTE(review): only one visitor is used; the description looks inaccurate
325
+ html = '<p>Test</p>'
326
+ calls = []
327
+ visitor = create_visitor
328
+
329
+ allow(visitor).to receive(:visit_text) do |_ctx, text|
330
+ calls << text
331
+ { type: :continue }
332
+ end
333
+
334
+ described_class.convert_with_visitor(html, nil, visitor)
335
+ expect(calls).to include('Test')
336
+ end
337
+ end
338
+
339
+ context 'VisitResult::Custom' do # callbacks returning { type: :custom, output: ... }
340
+ it 'replaces element output with custom text' do
341
+ html = '<p>Original</p>'
342
+ visitor = create_visitor
343
+
344
+ allow(visitor).to receive(:visit_text).and_return({ type: :custom, output: 'MODIFIED' })
345
+
346
+ result = described_class.convert_with_visitor(html, nil, visitor)
347
+ expect(result).to include('MODIFIED') # NOTE(review): absence of 'Original' is not asserted
348
+ end
349
+
350
+ it 'overrides link rendering' do
351
+ html = '<a href="https://example.com">Link</a>'
352
+ visitor = create_visitor
353
+
354
+ allow(visitor).to receive(:visit_link).and_return({ type: :custom, output: '**CUSTOM LINK**' })
355
+
356
+ result = described_class.convert_with_visitor(html, nil, visitor)
357
+ expect(result).to include('CUSTOM LINK') # matches the inner text only, not the surrounding asterisks
358
+ end
359
+
360
+ it 'overrides image rendering' do
361
+ html = '<img src="test.jpg" alt="Test">'
362
+ visitor = create_visitor
363
+
364
+ allow(visitor).to receive(:visit_image).and_return({ type: :custom, output: '[IMAGE PLACEHOLDER]' })
365
+
366
+ result = described_class.convert_with_visitor(html, nil, visitor)
367
+ expect(result).to include('IMAGE PLACEHOLDER') # matches the inner text only, not the brackets
368
+ end
369
+
370
+ it 'overrides heading rendering' do
371
+ html = '<h1>Title</h1>'
372
+ visitor = create_visitor
373
+
374
+ allow(visitor).to receive(:visit_heading).and_return({ type: :custom, output: '>>> TITLE <<<' })
375
+
376
+ result = described_class.convert_with_visitor(html, nil, visitor)
377
+ expect(result).to include('TITLE') # case-sensitive, so the original 'Title' would not satisfy this
378
+ end
379
+
380
+ it 'supports unicode in custom output' do
381
+ html = '<p>Text</p>'
382
+ visitor = create_visitor
383
+
384
+ allow(visitor).to receive(:visit_text).and_return({ type: :custom, output: '✓ Custom ✨' })
385
+
386
+ result = described_class.convert_with_visitor(html, nil, visitor)
387
+ expect(result).to include('✓')
388
+ expect(result).to include('✨')
389
+ end
390
+ end
391
+
392
+ context 'VisitResult::Skip' do # callbacks returning { type: :skip }
393
+ it 'removes element from output entirely' do
394
+ html = '<p>Keep</p><img src="skip.jpg" alt="Skip">'
395
+ visitor = create_visitor
396
+
397
+ allow(visitor).to receive(:visit_image).and_return({ type: :skip })
398
+
399
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
400
+ visit_element_start: { type: :continue },
401
+ visit_element_end: { type: :continue },
402
+ visit_text: { type: :continue }
403
+ )
404
+
405
+ result = described_class.convert_with_visitor(html, nil, visitor)
406
+ expect(result).to include('Keep')
407
+ expect(result).not_to include('skip') # lowercase: covers the src "skip.jpg" but not the alt "Skip"
408
+ end
409
+
410
+ it 'skips link entirely' do # NOTE(review): never asserts that 'hidden' is absent from the result
411
+ html = '<p>Before <a href="#">hidden</a> after</p>'
412
+ visitor = create_visitor
413
+
414
+ allow(visitor).to receive(:visit_link).and_return({ type: :skip })
415
+
416
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
417
+ visit_element_start: { type: :continue },
418
+ visit_element_end: { type: :continue },
419
+ visit_text: { type: :continue }
420
+ )
421
+
422
+ result = described_class.convert_with_visitor(html, nil, visitor)
423
+ expect(result).to include('Before')
424
+ expect(result).to include('after')
425
+ end
426
+
427
+ it 'skips multiple elements selectively' do
428
+ html = '<p>1</p><img src="a.jpg" alt="A"><p>2</p><img src="b.jpg" alt="B"><p>3</p>'
429
+ image_count = 0
430
+ visitor = create_visitor
431
+
432
+ allow(visitor).to receive(:visit_image) do # counts invocations while skipping every image
433
+ image_count += 1
434
+ { type: :skip }
435
+ end
436
+
437
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
438
+ visit_element_start: { type: :continue },
439
+ visit_element_end: { type: :continue },
440
+ visit_text: { type: :continue }
441
+ )
442
+
443
+ result = described_class.convert_with_visitor(html, nil, visitor)
444
+ expect(image_count).to eq(2)
445
+ expect(result).to include('1')
446
+ expect(result).to include('2')
447
+ expect(result).to include('3')
448
+ end
449
+ end
450
+
451
+ context 'VisitResult::PreserveHtml' do # callbacks returning { type: :preserve_html }
452
+ it 'preserves element as raw HTML in output' do # NOTE(review): no callback here returns :preserve_html
453
+ html = '<p>Text <span class="custom">styled</span> here</p>'
454
+ visitor = create_visitor
455
+
456
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
457
+ visit_element_start: { type: :continue },
458
+ visit_element_end: { type: :continue },
459
+ visit_text: { type: :continue }
460
+ )
461
+
462
+ result = described_class.convert_with_visitor(html, nil, visitor)
463
+ expect(result).to be_a(String) # only the return type is checked, not any preserved markup
464
+ end
465
+
466
+ it 'preserves links as HTML' do
467
+ html = '<p><a href="javascript:alert()">Click</a></p>'
468
+ visitor = create_visitor
469
+
470
+ allow(visitor).to receive(:visit_link).and_return({ type: :preserve_html })
471
+
472
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
473
+ visit_element_start: { type: :continue },
474
+ visit_element_end: { type: :continue },
475
+ visit_text: { type: :continue }
476
+ )
477
+
478
+ result = described_class.convert_with_visitor(html, nil, visitor)
479
+ expect(result).to be_a(String) # NOTE(review): does not assert that the <a> markup survives
480
+ end
481
+ end
482
+
483
+ context 'VisitResult::Error' do # callbacks returning { type: :error, message: ... }
484
+ it 'stops conversion with error message' do
485
+ html = '<p>Text</p>'
486
+ visitor = create_visitor
487
+
488
+ allow(visitor).to receive(:visit_text).and_return({ type: :error, message: 'Custom conversion error' })
489
+
490
+ expect do
491
+ described_class.convert_with_visitor(html, nil, visitor)
492
+ end.to raise_error(StandardError) # broad match: any StandardError subclass satisfies this
493
+ end
494
+
495
+ it 'includes custom error message' do
496
+ html = '<img src="invalid" alt="Bad">'
497
+ visitor = create_visitor
498
+
499
+ allow(visitor).to receive(:visit_image).and_return({ type: :error, message: 'Unsupported image format' })
500
+
501
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
502
+ visit_element_start: { type: :continue },
503
+ visit_element_end: { type: :continue }
504
+ )
505
+
506
+ expect do
507
+ described_class.convert_with_visitor(html, nil, visitor)
508
+ end.to raise_error(StandardError) { |err| expect(err.message).to include('Unsupported') }
509
+ end
510
+
511
+ it 'halts conversion at error point' do # NOTE(review): does not assert that later nodes were left unvisited
512
+ html = '<h1>Title</h1><p>Paragraph</p>'
513
+ visited_elements = []
514
+ visitor = create_visitor
515
+
516
+ allow(visitor).to receive(:visit_heading) do |_ctx, _level, _text, _id|
517
+ visited_elements << :heading
518
+ { type: :error, message: 'Stop here' }
519
+ end
520
+
521
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
522
+ visit_element_start: { type: :continue },
523
+ visit_element_end: { type: :continue },
524
+ visit_text: { type: :continue }
525
+ )
526
+
527
+ expect do
528
+ described_class.convert_with_visitor(html, nil, visitor)
529
+ end.to raise_error(StandardError)
530
+
531
+ expect(visited_elements).to include(:heading)
532
+ end
533
+ end
534
+
535
+ context 'NodeContext validation' do # checks the keys read from the ctx passed to visit_element_start
536
+ it 'provides tag_name in context' do
537
+ html = '<article>Content</article>'
538
+ contexts = []
539
+ visitor = create_visitor
540
+
541
+ allow(visitor).to receive(:visit_element_start) do |ctx|
542
+ contexts << ctx
543
+ { type: :continue }
544
+ end
545
+
546
+ described_class.convert_with_visitor(html, nil, visitor)
547
+
548
+ article_ctx = contexts.find { |ctx| ctx[:tag_name] == 'article' } # nil here would raise NoMethodError on the next line
549
+ expect(article_ctx[:tag_name]).to eq('article')
550
+ end
551
+
552
+ it 'provides attributes hash in context' do
553
+ html = '<div data-id="123" class="box">Content</div>'
554
+ visitor = create_visitor
555
+ captured_ctx = nil
556
+
557
+ allow(visitor).to receive(:visit_element_start) do |ctx|
558
+ captured_ctx = ctx if ctx[:tag_name] == 'div'
559
+ { type: :continue }
560
+ end
561
+
562
+ described_class.convert_with_visitor(html, nil, visitor)
563
+
564
+ expect(captured_ctx[:attributes]).to be_a(Hash)
565
+ expect(captured_ctx[:attributes]['data-id']).to eq('123') # attribute values are compared as Strings
566
+ expect(captured_ctx[:attributes]['class']).to eq('box')
567
+ end
568
+
569
+ it 'provides depth information' do
570
+ html = '<div><section><p>Nested</p></section></div>'
571
+ depths = {}
572
+ visitor = create_visitor
573
+
574
+ allow(visitor).to receive(:visit_element_start) do |ctx| # map tag name => reported depth
575
+ depths[ctx[:tag_name]] = ctx[:depth]
576
+ { type: :continue }
577
+ end
578
+
579
+ described_class.convert_with_visitor(html, nil, visitor)
580
+
581
+ expect(depths['div']).to be < depths['section'] # only relative ordering is asserted, not absolute depths
582
+ expect(depths['section']).to be < depths['p']
583
+ end
584
+
585
+ it 'provides parent_tag information' do
586
+ html = '<ul><li>Item</li></ul>'
587
+ visitor = create_visitor
588
+ li_parent = nil
589
+
590
+ allow(visitor).to receive(:visit_element_start) do |ctx|
591
+ li_parent = ctx[:parent_tag] if ctx[:tag_name] == 'li'
592
+ { type: :continue }
593
+ end
594
+
595
+ described_class.convert_with_visitor(html, nil, visitor)
596
+
597
+ expect(li_parent).to eq('ul')
598
+ end
599
+
600
+ it 'provides is_inline flag' do
601
+ html = '<p><strong>Bold</strong> and <em>italic</em></p>'
602
+ inline_elements = []
603
+ visitor = create_visitor
604
+
605
+ allow(visitor).to receive(:visit_element_start) do |ctx| # collect tag names whose ctx[:is_inline] is truthy
606
+ inline_elements << ctx[:tag_name] if ctx[:is_inline]
607
+ { type: :continue }
608
+ end
609
+
610
+ described_class.convert_with_visitor(html, nil, visitor)
611
+
612
+ expect(inline_elements).to include('strong')
613
+ expect(inline_elements).to include('em')
614
+ end
615
+
616
+ it 'provides index_in_parent information' do
617
+ html = '<ol><li>First</li><li>Second</li><li>Third</li></ol>'
618
+ indices = []
619
+ visitor = create_visitor
620
+
621
+ allow(visitor).to receive(:visit_element_start) do |ctx|
622
+ indices << ctx[:index_in_parent] if ctx[:tag_name] == 'li'
623
+ { type: :continue }
624
+ end
625
+
626
+ described_class.convert_with_visitor(html, nil, visitor)
627
+
628
+ expect(indices.length).to eq(3)
629
+ expect(indices).to include(0, 1, 2) # with length 3 this pins the set to 0, 1, 2 (order not asserted)
630
+ end
631
+ end
632
+
633
+ context 'error handling' do # raising callbacks, a nil visitor, and partially re-stubbed visitors
634
+ it 'handles visitor exceptions gracefully' do # "gracefully" here means a RuntimeError reaches the caller
635
+ html = '<p>Text</p>'
636
+ visitor = create_visitor
637
+
638
+ allow(visitor).to receive(:visit_text) do
639
+ raise 'Visitor error'
640
+ end
641
+
642
+ expect do
643
+ described_class.convert_with_visitor(html, nil, visitor)
644
+ end.to raise_error(RuntimeError)
645
+ end
646
+
647
+ it 'handles nil visitor gracefully' do
648
+ html = '<p>Content</p>'
649
+ result = described_class.convert_with_visitor(html, nil, nil) # nil visitor: conversion is still expected to succeed
650
+ expect(result).to include('Content')
651
+ end
652
+
653
+ it 'handles missing visitor methods' do # NOTE(review): create_visitor stubs every callback, so none is actually missing
654
+ html = '<p>Text</p>'
655
+ visitor = create_visitor
656
+
657
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
658
+ visit_element_start: { type: :continue },
659
+ visit_element_end: { type: :continue }
660
+ )
661
+
662
+ expect do
663
+ described_class.convert_with_visitor(html, nil, visitor)
664
+ end.not_to raise_error
665
+ end
666
+ end
667
+
668
+ context 'integration with ConversionOptions' do # passes a non-nil second argument alongside the visitor
669
+ it 'accepts ConversionOptions with visitor' do
670
+ html = '<h1>Title</h1>'
671
+ options = described_class.options(heading_style: :atx) # options object built by the library helper
672
+ visitor = create_visitor
673
+
674
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
675
+ visit_element_start: { type: :continue },
676
+ visit_element_end: { type: :continue },
677
+ visit_text: { type: :continue },
678
+ visit_heading: { type: :continue }
679
+ )
680
+
681
+ result = described_class.convert_with_visitor(html, options, visitor)
682
+ expect(result).to include('# Title')
683
+ end
684
+
685
+ it 'accepts options hash with visitor' do
686
+ html = '<h2>Heading</h2>'
687
+ options = { heading_style: :atx } # plain Hash instead of an options object
688
+ visitor = create_visitor
689
+
690
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
691
+ visit_element_start: { type: :continue },
692
+ visit_element_end: { type: :continue },
693
+ visit_text: { type: :continue },
694
+ visit_heading: { type: :continue }
695
+ )
696
+
697
+ result = described_class.convert_with_visitor(html, options, visitor)
698
+ expect(result).to include('## Heading')
699
+ end
700
+
701
+ it 'respects heading_style in options with visitor override' do # NOTE(review): every callback continues; nothing is overridden
702
+ html = '<h1>Title</h1>'
703
+ options = { heading_style: :atx_closed }
704
+ visitor = create_visitor
705
+
706
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
707
+ visit_element_start: { type: :continue },
708
+ visit_element_end: { type: :continue },
709
+ visit_text: { type: :continue },
710
+ visit_heading: { type: :continue }
711
+ )
712
+
713
+ result = described_class.convert_with_visitor(html, options, visitor)
714
+ expect(result).to include('#') # NOTE(review): weak check; does not verify a closing '#' for :atx_closed
715
+ end
716
+ end
717
+
718
+ context 'multiple visitor methods' do # several callbacks customised on the same visitor
719
+ it 'calls multiple methods for complex HTML' do
720
+ html = '<h1>Title</h1><p>Text with <a href="#link">link</a> and <img src="pic.jpg" alt="pic"></p>'
721
+ calls = []
722
+ visitor = create_visitor
723
+
724
+ allow(visitor).to receive(:visit_heading) do |_ctx, _level, _text, _id|
725
+ calls << :heading
726
+ { type: :continue }
727
+ end
728
+
729
+ allow(visitor).to receive(:visit_text) do |_ctx, text|
730
+ calls << :text unless text.strip.empty? # whitespace-only text nodes are not recorded
731
+ { type: :continue }
732
+ end
733
+
734
+ allow(visitor).to receive(:visit_link) do |_ctx, _href, _text, _title|
735
+ calls << :link
736
+ { type: :continue }
737
+ end
738
+
739
+ allow(visitor).to receive(:visit_image) do |_ctx, _src, _alt, _title|
740
+ calls << :image
741
+ { type: :continue }
742
+ end
743
+
744
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
745
+ visit_element_start: { type: :continue },
746
+ visit_element_end: { type: :continue }
747
+ )
748
+
749
+ described_class.convert_with_visitor(html, nil, visitor)
750
+
751
+ expect(calls).to include(:heading) # :text is recorded above but never asserted
752
+ expect(calls).to include(:link)
753
+ expect(calls).to include(:image)
754
+ end
755
+
756
+ it 'allows selective overrides of specific callbacks' do # only visit_heading returns a custom result here
757
+ html = '<h1>Title</h1><p><a href="#">Link</a></p>'
758
+ visitor = create_visitor
759
+
760
+ allow(visitor).to receive_messages(visit_heading: { type: :custom, output: '>>> HEADING <<<' },
761
+ visit_text: { type: :continue })
762
+
763
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
764
+ visit_element_start: { type: :continue },
765
+ visit_element_end: { type: :continue },
766
+ visit_link: { type: :continue }
767
+ )
768
+
769
+ result = described_class.convert_with_visitor(html, nil, visitor)
770
+ expect(result).to include('HEADING')
771
+ expect(result).to include('Link')
772
+ end
773
+
774
+ it 'supports different results for different elements' do # NOTE(review): the skipped image's absence is not asserted
775
+ html = '<h1>Header</h1><img src="skip.jpg" alt="skip"><p>Text</p>'
776
+ visitor = create_visitor
777
+
778
+ allow(visitor).to receive_messages(visit_heading: { type: :custom, output: 'CUSTOM HEADING' },
779
+ visit_image: { type: :skip }, visit_text: { type: :continue })
780
+
781
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
782
+ visit_element_start: { type: :continue },
783
+ visit_element_end: { type: :continue }
784
+ )
785
+
786
+ result = described_class.convert_with_visitor(html, nil, visitor)
787
+ expect(result).to include('CUSTOM HEADING')
788
+ expect(result).to include('Text')
789
+ end
790
+ end
791
+
792
+ context 'nested elements' do # callbacks observed while converting nested markup
793
+ it 'visits deeply nested elements in order' do # NOTE(review): only membership is asserted, not visiting order
794
+ html = '<div><ul><li><strong>Nested <em>content</em></strong></li></ul></div>'
795
+ visited_tags = []
796
+ visitor = create_visitor
797
+
798
+ allow(visitor).to receive(:visit_element_start) do |ctx|
799
+ visited_tags << ctx[:tag_name]
800
+ { type: :continue }
801
+ end
802
+
803
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
804
+ visit_element_end: { type: :continue },
805
+ visit_text: { type: :continue }
806
+ )
807
+
808
+ described_class.convert_with_visitor(html, nil, visitor)
809
+
810
+ expect(visited_tags).to include('div')
811
+ expect(visited_tags).to include('ul')
812
+ expect(visited_tags).to include('li')
813
+ expect(visited_tags).to include('strong')
814
+ expect(visited_tags).to include('em')
815
+ end
816
+
817
+ it 'provides correct depth for nested elements' do
818
+ html = '<div><div><p>Deep</p></div></div>'
819
+ depths = {}
820
+ visitor = create_visitor
821
+
822
+ allow(visitor).to receive(:visit_element_start) do |ctx| # tag name => depths in the order the callback saw them
823
+ depths[ctx[:tag_name]] ||= []
824
+ depths[ctx[:tag_name]] << ctx[:depth]
825
+ { type: :continue }
826
+ end
827
+
828
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
829
+ visit_element_end: { type: :continue },
830
+ visit_text: { type: :continue }
831
+ )
832
+
833
+ described_class.convert_with_visitor(html, nil, visitor)
834
+
835
+ expect(depths['div'].first).to be < depths['div'].last # assumes the outer div is reported before the inner one
836
+ expect(depths['p'].first).to be > depths['div'].last
837
+ end
838
+
839
+ it 'handles custom output in nested context' do
840
+ html = '<ul><li><a href="#">link</a></li></ul>'
841
+ visitor = create_visitor
842
+
843
+ allow(visitor).to receive(:visit_link).and_return({ type: :custom, output: '[MODIFIED]' })
844
+
845
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
846
+ visit_element_start: { type: :continue },
847
+ visit_element_end: { type: :continue },
848
+ visit_text: { type: :continue }
849
+ )
850
+
851
+ result = described_class.convert_with_visitor(html, nil, visitor)
852
+ expect(result).to include('[MODIFIED]')
853
+ end
854
+
855
+ it 'allows skipping nested elements' do # NOTE(review): never asserts that 'Skip this' is absent
856
+ html = '<div><p>Keep</p><span>Skip this</span><p>Keep too</p></div>'
857
+ visitor = create_visitor
858
+
859
+ allow(visitor).to receive(:visit_element_start) do |ctx| # skip only <span>; continue for every other tag
860
+ if ctx[:tag_name] == 'span'
861
+ { type: :skip }
862
+ else
863
+ { type: :continue }
864
+ end
865
+ end
866
+
867
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
868
+ visit_element_end: { type: :continue },
869
+ visit_text: { type: :continue }
870
+ )
871
+
872
+ result = described_class.convert_with_visitor(html, nil, visitor)
873
+ expect(result).to include('Keep')
874
+ expect(result).to include('Keep too')
875
+ end
876
+ end
877
+
878
+ context 'less common visitor methods' do # each spec only checks that the element-specific callback was invoked
879
+ it 'calls visit_strong for bold elements' do
880
+ html = '<strong>Bold</strong>'
881
+ visitor = create_visitor
882
+
883
+ allow(visitor).to receive(:visit_strong).and_return({ type: :continue }) # same value create_visitor already stubs
884
+
885
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
886
+ visit_element_start: { type: :continue },
887
+ visit_element_end: { type: :continue },
888
+ visit_text: { type: :continue }
889
+ )
890
+
891
+ described_class.convert_with_visitor(html, nil, visitor)
892
+ expect(visitor).to have_received(:visit_strong)
893
+ end
894
+
895
+ it 'calls visit_emphasis for italic elements' do
896
+ html = '<em>Italic</em>'
897
+ visitor = create_visitor
898
+
899
+ allow(visitor).to receive(:visit_emphasis).and_return({ type: :continue }) # same value create_visitor already stubs
900
+
901
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
902
+ visit_element_start: { type: :continue },
903
+ visit_element_end: { type: :continue },
904
+ visit_text: { type: :continue }
905
+ )
906
+
907
+ described_class.convert_with_visitor(html, nil, visitor)
908
+ expect(visitor).to have_received(:visit_emphasis)
909
+ end
910
+
911
+ it 'calls visit_code_block for pre/code' do
912
+ html = '<pre><code>function() {}</code></pre>'
913
+ visitor = create_visitor
914
+
915
+ allow(visitor).to receive(:visit_code_block).and_return({ type: :continue }) # same value create_visitor already stubs
916
+
917
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
918
+ visit_element_start: { type: :continue },
919
+ visit_element_end: { type: :continue },
920
+ visit_text: { type: :continue }
921
+ )
922
+
923
+ described_class.convert_with_visitor(html, nil, visitor)
924
+ expect(visitor).to have_received(:visit_code_block)
925
+ end
926
+
927
+ it 'calls visit_blockquote for quotes' do
928
+ html = '<blockquote>Quote text</blockquote>'
929
+ visitor = create_visitor
930
+
931
+ allow(visitor).to receive(:visit_blockquote).and_return({ type: :continue }) # same value create_visitor already stubs
932
+
933
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
934
+ visit_element_start: { type: :continue },
935
+ visit_element_end: { type: :continue },
936
+ visit_text: { type: :continue }
937
+ )
938
+
939
+ described_class.convert_with_visitor(html, nil, visitor)
940
+ expect(visitor).to have_received(:visit_blockquote)
941
+ end
942
+
943
+ it 'calls visit_list_item for list items' do
944
+ html = '<ul><li>Item</li></ul>'
945
+ visitor = create_visitor
946
+
947
+ allow(visitor).to receive(:visit_list_item).and_return({ type: :continue }) # same value create_visitor already stubs
948
+
949
+ allow(visitor).to receive_messages( # re-stubs values create_visitor already installs
950
+ visit_element_start: { type: :continue },
951
+ visit_element_end: { type: :continue },
952
+ visit_text: { type: :continue },
953
+ visit_list_start: { type: :continue },
954
+ visit_list_end: { type: :continue }
955
+ )
956
+
957
+ described_class.convert_with_visitor(html, nil, visitor)
958
+ expect(visitor).to have_received(:visit_list_item)
959
+ end
960
+ end
961
+
962
+ context 'unicode and special characters' do
963
+ it 'handles unicode text in visitor' do # Checks that Japanese text reaches visit_text unchanged.
964
+ html = '<p>日本語テキスト</p>'
965
+ text_received = nil # filled in by the visit_text stub below
966
+ visitor = create_visitor
967
+
968
+ allow(visitor).to receive(:visit_text) do |_ctx, text| # first argument is ignored; second is the text under test
969
+ text_received = text # overwritten on every call, so only the last text is kept
970
+ { type: :continue }
971
+ end
972
+
973
+ described_class.convert_with_visitor(html, nil, visitor)
974
+ expect(text_received).to eq('日本語テキスト')
975
+ end
976
+
977
+ it 'handles unicode in custom output' do # Checks that non-ASCII custom output from the visitor appears in the result.
978
+ html = '<p>Text</p>'
979
+ visitor = create_visitor
980
+
981
+ allow(visitor).to receive(:visit_text).and_return({ type: :custom, output: '引用:引用' }) # every text callback returns this custom hash
982
+
983
+ result = described_class.convert_with_visitor(html, nil, visitor)
984
+ expect(result).to include('引用') # only a substring of the custom output is asserted
985
+ end
986
+
987
+ it 'handles HTML entities in visitor' do # Checks the text handed to visit_text for entity-encoded input.
988
+ html = '<p>&lt;code&gt; and &amp; symbols</p>'
989
+ text_received = nil # filled in by the visit_text stub below
990
+ visitor = create_visitor
991
+
992
+ allow(visitor).to receive(:visit_text) do |_ctx, text| # first argument is ignored; second is the text under test
993
+ text_received = text # overwritten on every call, so only the last text is kept
994
+ { type: :continue }
995
+ end
996
+
997
+ described_class.convert_with_visitor(html, nil, visitor)
998
+ expect(text_received).to include('<code>') # i.e. &lt; and &gt; are expected to arrive decoded
999
+ expect(text_received).to include('&') # NOTE(review): would also match an undecoded '&amp;'; weak check
1000
+ end
1001
+ end
1002
+
1003
+ context 'state management in visitor' do
1004
+ it 'allows visitor to maintain state across calls' do # Counts <p> starts in a closure variable across callbacks.
1005
+ html = '<p>One</p><p>Two</p><p>Three</p>'
1006
+ visitor = create_visitor
1007
+ element_count = 0 # captured by the stub block, so it persists between callback invocations
1008
+
1009
+ allow(visitor).to receive(:visit_element_start) do |ctx|
1010
+ element_count += 1 if ctx[:tag_name] == 'p' # only paragraph elements are counted
1011
+ { type: :continue }
1012
+ end
1013
+
1014
+ allow(visitor).to receive_messages( # remaining callbacks answer with the :continue result hash
1015
+ visit_element_end: { type: :continue },
1016
+ visit_text: { type: :continue }
1017
+ )
1018
+
1019
+ described_class.convert_with_visitor(html, nil, visitor)
1020
+ expect(element_count).to eq(3) # one per <p> in the fixture
1021
+ end
1022
+
1023
+ it 'allows visitor to conditionally modify based on accumulated state' do # Replaces only the second <p> with custom output.
1024
+ html = '<p>A</p><p>B</p><p>C</p>'
1025
+ visitor = create_visitor
1026
+ paragraph_count = 0 # captured by the stub block, so it persists between callback invocations
1027
+
1028
+ allow(visitor).to receive(:visit_element_start) do |ctx|
1029
+ if ctx[:tag_name] == 'p'
1030
+ paragraph_count += 1
1031
+ if paragraph_count == 2 # second paragraph seen so far
1032
+ { type: :custom, output: '[SECOND_PARAGRAPH]' }
1033
+ else
1034
+ { type: :continue }
1035
+ end
1036
+ else # non-paragraph elements are left alone
1037
+ { type: :continue }
1038
+ end
1039
+ end
1040
+
1041
+ allow(visitor).to receive_messages( # remaining callbacks answer with the :continue result hash
1042
+ visit_element_end: { type: :continue },
1043
+ visit_text: { type: :continue }
1044
+ )
1045
+
1046
+ result = described_class.convert_with_visitor(html, nil, visitor)
1047
+ expect(result).to include('[SECOND_PARAGRAPH]')
1048
+ end
1049
+ end
1050
+
1051
+ context 'edge cases' do
1052
+ it 'handles empty HTML' do # An empty input string should still produce a String result.
1053
+ html = ''
1054
+ visitor = create_visitor
1055
+
1056
+ allow(visitor).to receive_messages( # generic callbacks all answer with the :continue result hash
1057
+ visit_element_start: { type: :continue },
1058
+ visit_element_end: { type: :continue },
1059
+ visit_text: { type: :continue }
1060
+ )
1061
+
1062
+ result = described_class.convert_with_visitor(html, nil, visitor)
1063
+ expect(result).to be_a(String) # only the type is asserted, not the content
1064
+ end
1065
+
1066
+ it 'handles HTML with only whitespace' do
1067
+ html = ' \n\t '
1068
+ visitor = create_visitor
1069
+
1070
+ allow(visitor).to receive_messages(
1071
+ visit_element_start: { type: :continue },
1072
+ visit_element_end: { type: :continue },
1073
+ visit_text: { type: :continue }
1074
+ )
1075
+
1076
+ result = described_class.convert_with_visitor(html, nil, visitor)
1077
+ expect(result).to be_a(String)
1078
+ end
1079
+
1080
+ it 'handles very long text content' do # A 10,000-character text node should reach visit_text at full length.
1081
+ long_text = 'A' * 10_000
1082
+ html = "<p>#{long_text}</p>"
1083
+ visitor = create_visitor
1084
+ text_received = nil # filled in by the visit_text stub below
1085
+
1086
+ allow(visitor).to receive(:visit_text) do |_ctx, text| # first argument is ignored; second is the text under test
1087
+ text_received = text # overwritten on every call, so only the last text is kept
1088
+ { type: :continue }
1089
+ end
1090
+
1091
+ described_class.convert_with_visitor(html, nil, visitor)
1092
+ expect(text_received.length).to eq(10_000) # length only; raises NoMethodError if visit_text was never called
1093
+ end
1094
+
1095
+ it 'handles deeply nested HTML (stress test)' do # 50 nested <div>s must convert and keep the inner text.
1096
+ html = "#{'<div>' * 50}Deep#{'</div>' * 50}"
1097
+ visitor = create_visitor
1098
+ element_count = 0 # incremented by the visit_element_start stub below
1099
+
1100
+ allow(visitor).to receive(:visit_element_start) do |_ctx|
1101
+ element_count += 1 # counts every element start, regardless of tag
1102
+ { type: :continue }
1103
+ end
1104
+
1105
+ allow(visitor).to receive_messages( # remaining callbacks answer with the :continue result hash
1106
+ visit_element_end: { type: :continue },
1107
+ visit_text: { type: :continue }
1108
+ )
1109
+
1110
+ result = described_class.convert_with_visitor(html, nil, visitor)
1111
+ expect(element_count).to be > 0 # loose bound: the exact number of callbacks is not pinned
1112
+ expect(result).to include('Deep')
1113
+ end
1114
+
1115
+ it 'handles self-closing tags' do # Text on both sides of a <br/> must survive conversion.
1116
+ html = '<p>Before<br/>After</p>'
1117
+ visitor = create_visitor
1118
+
1119
+ allow(visitor).to receive(:visit_line_break).and_return({ type: :continue }) # stubbed only; this example never asserts it was called
1120
+
1121
+ allow(visitor).to receive_messages( # generic callbacks all answer with the :continue result hash
1122
+ visit_element_start: { type: :continue },
1123
+ visit_element_end: { type: :continue },
1124
+ visit_text: { type: :continue }
1125
+ )
1126
+
1127
+ result = described_class.convert_with_visitor(html, nil, visitor)
1128
+ expect(result).to include('Before')
1129
+ expect(result).to include('After')
1130
+ end
1131
+
1132
+ it 'handles malformed HTML gracefully' do # Unclosed and mis-nested tags must not make the conversion raise.
1133
+ html = '<p>Unclosed <div>tag<p>Another'
1134
+ visitor = create_visitor
1135
+
1136
+ allow(visitor).to receive_messages( # generic callbacks all answer with the :continue result hash
1137
+ visit_element_start: { type: :continue },
1138
+ visit_element_end: { type: :continue },
1139
+ visit_text: { type: :continue }
1140
+ )
1141
+
1142
+ expect do # block form is required so RSpec can catch an exception raised inside
1143
+ described_class.convert_with_visitor(html, nil, visitor)
1144
+ end.not_to raise_error
1145
+ end
1146
+ end
1147
+ end
1148
+ end
1149
+ # rubocop:enable RSpec/ContextWording, RSpec/VerifiedDoubles