html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
data/spec/convert_spec.rb
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
|
|
5
|
-
RSpec.describe HtmlToMarkdown do
|
|
6
|
-
describe '.convert' do
|
|
7
|
-
it 'converts simple headings' do
|
|
8
|
-
expect(described_class.convert('<h1>Hello</h1>')).to eq("# Hello\n")
|
|
9
|
-
end
|
|
10
|
-
|
|
11
|
-
it 'accepts options hash' do
|
|
12
|
-
result = described_class.convert(
|
|
13
|
-
'<h1>Hello</h1>',
|
|
14
|
-
heading_style: :atx_closed,
|
|
15
|
-
default_title: true
|
|
16
|
-
)
|
|
17
|
-
expect(result).to include('Hello')
|
|
18
|
-
end
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
describe '.convert_with_inline_images' do
|
|
22
|
-
it 'returns inline images metadata' do
|
|
23
|
-
html = '<p><img src="data:image/png;base64,ZmFrZQ==" alt="fake"></p>'
|
|
24
|
-
extraction = described_class.convert_with_inline_images(html)
|
|
25
|
-
expect(extraction).to include(:markdown, :inline_images, :warnings)
|
|
26
|
-
expect(extraction[:inline_images].first[:description]).to eq('fake')
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
describe '.options' do
|
|
31
|
-
it 'returns a reusable options handle' do
|
|
32
|
-
handle = described_class.options(heading_style: :atx_closed)
|
|
33
|
-
expect(handle).to be_a(HtmlToMarkdown::Options)
|
|
34
|
-
result = described_class.convert_with_options('<h1>Hello</h1>', handle)
|
|
35
|
-
expect(result).to include('# Hello #')
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
describe 'panic handling' do
|
|
40
|
-
context 'when a Rust panic would occur' do
|
|
41
|
-
it 'catches panics in convert method' do
|
|
42
|
-
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
43
|
-
|
|
44
|
-
begin
|
|
45
|
-
result = described_class.convert(malformed_html)
|
|
46
|
-
expect(result).to be_a(String)
|
|
47
|
-
rescue RuntimeError => e
|
|
48
|
-
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
it 'catches panics in convert_with_options method' do
|
|
53
|
-
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
54
|
-
handle = described_class.options(heading_style: :atx)
|
|
55
|
-
|
|
56
|
-
begin
|
|
57
|
-
result = described_class.convert_with_options(malformed_html, handle)
|
|
58
|
-
expect(result).to be_a(String)
|
|
59
|
-
rescue RuntimeError => e
|
|
60
|
-
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
it 'catches panics in convert_with_inline_images method' do
|
|
65
|
-
malformed_html = "#{'<' * 100_000}div#{'>' * 100_000}"
|
|
66
|
-
|
|
67
|
-
begin
|
|
68
|
-
result = described_class.convert_with_inline_images(malformed_html)
|
|
69
|
-
expect(result).to be_a(Hash)
|
|
70
|
-
expect(result).to include(:markdown, :inline_images, :warnings)
|
|
71
|
-
rescue RuntimeError => e
|
|
72
|
-
expect(e.message).to match(/html-to-markdown panic during conversion/)
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
end
|
|
data/spec/convert_with_tables_spec.rb
DELETED
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
|
|
5
|
-
RSpec.describe HtmlToMarkdown do
|
|
6
|
-
describe '.convert_with_tables' do
|
|
7
|
-
it 'returns a hash with content, metadata, and tables keys' do
|
|
8
|
-
html = '<table><tr><td>Cell</td></tr></table>'
|
|
9
|
-
result = described_class.convert_with_tables(html)
|
|
10
|
-
|
|
11
|
-
expect(result).to be_a(Hash)
|
|
12
|
-
expect(result).to include(:content, :metadata, :tables)
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
context 'with a basic table with header' do
|
|
16
|
-
let(:html) do
|
|
17
|
-
<<~HTML
|
|
18
|
-
<table>
|
|
19
|
-
<thead>
|
|
20
|
-
<tr><th>Name</th><th>Age</th></tr>
|
|
21
|
-
</thead>
|
|
22
|
-
<tbody>
|
|
23
|
-
<tr><td>Alice</td><td>30</td></tr>
|
|
24
|
-
</tbody>
|
|
25
|
-
</table>
|
|
26
|
-
HTML
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
it 'extracts exactly one table' do
|
|
30
|
-
result = described_class.convert_with_tables(html)
|
|
31
|
-
|
|
32
|
-
expect(result[:tables].length).to eq(1)
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
it 'extracts cells as rows of columns' do
|
|
36
|
-
result = described_class.convert_with_tables(html)
|
|
37
|
-
table = result[:tables][0]
|
|
38
|
-
|
|
39
|
-
expect(table[:cells]).to be_an(Array)
|
|
40
|
-
expect(table[:cells].length).to eq(2)
|
|
41
|
-
expect(table[:cells][0]).to eq(%w[Name Age])
|
|
42
|
-
expect(table[:cells][1]).to eq(%w[Alice 30])
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
it 'provides markdown representation' do
|
|
46
|
-
result = described_class.convert_with_tables(html)
|
|
47
|
-
table = result[:tables][0]
|
|
48
|
-
|
|
49
|
-
expect(table[:markdown]).to be_a(String)
|
|
50
|
-
expect(table[:markdown]).to include('Name')
|
|
51
|
-
expect(table[:markdown]).to include('Alice')
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
it 'marks header rows correctly' do
|
|
55
|
-
result = described_class.convert_with_tables(html)
|
|
56
|
-
table = result[:tables][0]
|
|
57
|
-
|
|
58
|
-
expect(table[:is_header_row]).to be_an(Array)
|
|
59
|
-
expect(table[:is_header_row].length).to eq(2)
|
|
60
|
-
expect(table[:is_header_row][0]).to be true
|
|
61
|
-
expect(table[:is_header_row][1]).to be false
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
it 'includes converted markdown content' do
|
|
65
|
-
result = described_class.convert_with_tables(html)
|
|
66
|
-
|
|
67
|
-
expect(result[:content]).to be_a(String)
|
|
68
|
-
expect(result[:content]).not_to be_empty
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
context 'with empty HTML' do
|
|
73
|
-
it 'returns empty tables array' do
|
|
74
|
-
result = described_class.convert_with_tables('')
|
|
75
|
-
|
|
76
|
-
expect(result[:tables]).to eq([])
|
|
77
|
-
expect(result[:content]).to be_a(String)
|
|
78
|
-
end
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
context 'with HTML containing no tables' do
|
|
82
|
-
it 'returns empty tables array' do
|
|
83
|
-
html = '<p>No tables here</p>'
|
|
84
|
-
result = described_class.convert_with_tables(html)
|
|
85
|
-
|
|
86
|
-
expect(result[:tables]).to eq([])
|
|
87
|
-
expect(result[:content]).to include('No tables here')
|
|
88
|
-
end
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
context 'with multiple tables' do
|
|
92
|
-
let(:html) do
|
|
93
|
-
<<~HTML
|
|
94
|
-
<table>
|
|
95
|
-
<tr><th>A</th></tr>
|
|
96
|
-
<tr><td>1</td></tr>
|
|
97
|
-
</table>
|
|
98
|
-
<p>Some text between tables</p>
|
|
99
|
-
<table>
|
|
100
|
-
<tr><th>B</th><th>C</th></tr>
|
|
101
|
-
<tr><td>2</td><td>3</td></tr>
|
|
102
|
-
<tr><td>4</td><td>5</td></tr>
|
|
103
|
-
</table>
|
|
104
|
-
HTML
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
it 'extracts all tables' do
|
|
108
|
-
result = described_class.convert_with_tables(html)
|
|
109
|
-
|
|
110
|
-
expect(result[:tables].length).to eq(2)
|
|
111
|
-
end
|
|
112
|
-
|
|
113
|
-
it 'preserves table order' do
|
|
114
|
-
result = described_class.convert_with_tables(html)
|
|
115
|
-
|
|
116
|
-
first_table = result[:tables][0]
|
|
117
|
-
second_table = result[:tables][1]
|
|
118
|
-
|
|
119
|
-
expect(first_table[:cells][0]).to eq(['A'])
|
|
120
|
-
expect(second_table[:cells][0]).to eq(%w[B C])
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
it 'extracts correct row counts per table' do
|
|
124
|
-
result = described_class.convert_with_tables(html)
|
|
125
|
-
|
|
126
|
-
expect(result[:tables][0][:cells].length).to eq(2)
|
|
127
|
-
expect(result[:tables][1][:cells].length).to eq(3)
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
context 'with special characters in cells' do
|
|
132
|
-
let(:html) do
|
|
133
|
-
<<~HTML
|
|
134
|
-
<table>
|
|
135
|
-
<tr><th>Key</th><th>Value</th></tr>
|
|
136
|
-
<tr><td>Brackets <></td><td>Ampersand &</td></tr>
|
|
137
|
-
<tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
|
|
138
|
-
<tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
|
|
139
|
-
</table>
|
|
140
|
-
HTML
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
it 'handles HTML entities in cells' do
|
|
144
|
-
result = described_class.convert_with_tables(html)
|
|
145
|
-
table = result[:tables][0]
|
|
146
|
-
|
|
147
|
-
expect(table[:cells][1][0]).to include('<>')
|
|
148
|
-
expect(table[:cells][1][1]).to include('&')
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
it 'handles quotes in cells' do
|
|
152
|
-
result = described_class.convert_with_tables(html)
|
|
153
|
-
table = result[:tables][0]
|
|
154
|
-
|
|
155
|
-
expect(table[:cells][2][0]).to include('"double"')
|
|
156
|
-
expect(table[:cells][2][1]).to include("'single'")
|
|
157
|
-
end
|
|
158
|
-
|
|
159
|
-
it 'handles unicode in cells' do
|
|
160
|
-
result = described_class.convert_with_tables(html)
|
|
161
|
-
table = result[:tables][0]
|
|
162
|
-
|
|
163
|
-
expect(table[:cells][3][0]).to be_a(String)
|
|
164
|
-
end
|
|
165
|
-
end
|
|
166
|
-
|
|
167
|
-
context 'with conversion options' do
|
|
168
|
-
it 'accepts options hash' do
|
|
169
|
-
html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
|
|
170
|
-
result = described_class.convert_with_tables(html, { heading_style: :atx })
|
|
171
|
-
|
|
172
|
-
expect(result).to be_a(Hash)
|
|
173
|
-
expect(result[:tables].length).to eq(1)
|
|
174
|
-
end
|
|
175
|
-
|
|
176
|
-
it 'accepts nil options' do
|
|
177
|
-
html = '<table><tr><td>Data</td></tr></table>'
|
|
178
|
-
result = described_class.convert_with_tables(html, nil, nil)
|
|
179
|
-
|
|
180
|
-
expect(result).to be_a(Hash)
|
|
181
|
-
expect(result[:tables].length).to eq(1)
|
|
182
|
-
end
|
|
183
|
-
end
|
|
184
|
-
|
|
185
|
-
context 'with metadata config' do
|
|
186
|
-
it 'includes metadata when configured' do
|
|
187
|
-
html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
|
|
188
|
-
result = described_class.convert_with_tables(html, nil, { extract_headers: true })
|
|
189
|
-
|
|
190
|
-
expect(result[:metadata]).to be_a(Hash).or(be_nil)
|
|
191
|
-
end
|
|
192
|
-
end
|
|
193
|
-
end
|
|
194
|
-
end
|