html-to-markdown 2.26.2 → 2.26.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -10
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +4 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +6 -0
- data/rust-vendor/html-to-markdown-rs/tests/integration_test.rs +60 -0
- data/spec/visitor_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 23d0242cd4fc575d8081e675fb8d16f09faa7fb1c6c0df9b18d21338c0391880
|
|
4
|
+
data.tar.gz: cf86724440a34a26e1f17c134a232b1b321edaacb95758e42f9eab59dc710f8b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 63afe8bdf9d36f4cc225859e3a7ebb62452e97feafccc5ea1e20564a47b7e037900b6af7300508eec4313d4306608e3b00bc5f6a3115001ee250d5c560880bb6
|
|
7
|
+
data.tar.gz: c8fcaa6e61fea4325b08ce39ebf0a2bd92ff4e6d58702497e38cc6a081c1eec3de095d72986ccdc828ee6e219697d53c12483087ee563d39f767fe989d72ffdb
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.26.2)
|
|
4
|
+
html-to-markdown (2.26.3)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -20,6 +20,8 @@ GEM
|
|
|
20
20
|
securerandom (>= 0.3)
|
|
21
21
|
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
22
|
uri (>= 0.13.1)
|
|
23
|
+
addressable (2.8.9)
|
|
24
|
+
public_suffix (>= 2.0.2, < 8.0)
|
|
23
25
|
ast (2.4.3)
|
|
24
26
|
base64 (0.3.0)
|
|
25
27
|
bigdecimal (4.0.1)
|
|
@@ -37,6 +39,9 @@ GEM
|
|
|
37
39
|
i18n (1.14.8)
|
|
38
40
|
concurrent-ruby (~> 1.0)
|
|
39
41
|
json (2.18.1)
|
|
42
|
+
json-schema (6.1.0)
|
|
43
|
+
addressable (~> 2.8)
|
|
44
|
+
bigdecimal (>= 3.1, < 5)
|
|
40
45
|
language_server-protocol (3.17.0.5)
|
|
41
46
|
lint_roller (1.1.0)
|
|
42
47
|
listen (3.10.0)
|
|
@@ -44,14 +49,18 @@ GEM
|
|
|
44
49
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
45
50
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
46
51
|
logger (1.7.0)
|
|
47
|
-
|
|
52
|
+
mcp (0.7.1)
|
|
53
|
+
json-schema (>= 4.1)
|
|
54
|
+
minitest (6.0.2)
|
|
55
|
+
drb (~> 2.0)
|
|
48
56
|
prism (~> 1.5)
|
|
49
57
|
mutex_m (0.3.0)
|
|
50
58
|
parallel (1.27.0)
|
|
51
|
-
parser (3.3.10.
|
|
59
|
+
parser (3.3.10.2)
|
|
52
60
|
ast (~> 2.4.1)
|
|
53
61
|
racc
|
|
54
62
|
prism (1.9.0)
|
|
63
|
+
public_suffix (7.0.2)
|
|
55
64
|
racc (1.8.1)
|
|
56
65
|
rainbow (3.1.1)
|
|
57
66
|
rake (13.3.1)
|
|
@@ -76,14 +85,15 @@ GEM
|
|
|
76
85
|
rspec-expectations (3.13.5)
|
|
77
86
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
78
87
|
rspec-support (~> 3.13.0)
|
|
79
|
-
rspec-mocks (3.13.
|
|
88
|
+
rspec-mocks (3.13.8)
|
|
80
89
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
81
90
|
rspec-support (~> 3.13.0)
|
|
82
91
|
rspec-support (3.13.7)
|
|
83
|
-
rubocop (1.
|
|
92
|
+
rubocop (1.85.0)
|
|
84
93
|
json (~> 2.3)
|
|
85
94
|
language_server-protocol (~> 3.17.0.2)
|
|
86
95
|
lint_roller (~> 1.1.0)
|
|
96
|
+
mcp (~> 0.6)
|
|
87
97
|
parallel (~> 1.10)
|
|
88
98
|
parser (>= 3.3.0.2)
|
|
89
99
|
rainbow (>= 2.2.2, < 4.0)
|
|
@@ -147,6 +157,7 @@ DEPENDENCIES
|
|
|
147
157
|
|
|
148
158
|
CHECKSUMS
|
|
149
159
|
activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
|
|
160
|
+
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
150
161
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
151
162
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
152
163
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
@@ -161,18 +172,21 @@ CHECKSUMS
|
|
|
161
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
162
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
163
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
164
|
-
html-to-markdown (2.26.2)
|
|
175
|
+
html-to-markdown (2.26.3)
|
|
165
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
166
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
|
+
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
167
179
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
168
180
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
169
181
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
170
182
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
171
|
-
|
|
183
|
+
mcp (0.7.1) sha256=fa967895d6952bad0d981ea907731d8528d2c246d2079d56a9c8bae83d14f1c7
|
|
184
|
+
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
172
185
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
173
186
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
174
|
-
parser (3.3.10.
|
|
187
|
+
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
175
188
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
189
|
+
public_suffix (7.0.2) sha256=9114090c8e4e7135c1fd0e7acfea33afaab38101884320c65aaa0ffb8e26a857
|
|
176
190
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
177
191
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
178
192
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
@@ -186,9 +200,9 @@ CHECKSUMS
|
|
|
186
200
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
187
201
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
188
202
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
189
|
-
rspec-mocks (3.13.
|
|
203
|
+
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
190
204
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
191
|
-
rubocop (1.
|
|
205
|
+
rubocop (1.85.0) sha256=317407feb681a07d54f64d2f9e1d6b6af1ce7678e51cd658e3ad8bd66da48c01
|
|
192
206
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
193
207
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
194
208
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
@@ -87,6 +87,8 @@ pub fn handle_subscript(
|
|
|
87
87
|
} else {
|
|
88
88
|
output.push_str(&options.sub_symbol);
|
|
89
89
|
}
|
|
90
|
+
} else {
|
|
91
|
+
output.push_str(trimmed);
|
|
90
92
|
}
|
|
91
93
|
append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
|
|
92
94
|
}
|
|
@@ -139,6 +141,8 @@ pub fn handle_superscript(
|
|
|
139
141
|
} else {
|
|
140
142
|
output.push_str(&options.sup_symbol);
|
|
141
143
|
}
|
|
144
|
+
} else {
|
|
145
|
+
output.push_str(trimmed);
|
|
142
146
|
}
|
|
143
147
|
append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
|
|
144
148
|
}
|
|
@@ -82,6 +82,12 @@ pub fn process_text_node(
|
|
|
82
82
|
if !output.ends_with("\n\n") {
|
|
83
83
|
if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
|
|
84
84
|
if is_inline_element(next_tag) {
|
|
85
|
+
// Newlines between inline elements collapse to a single space
|
|
86
|
+
// in HTML rendering (per CSS white-space: normal). Preserve
|
|
87
|
+
// this word boundary so adjacent inline content doesn't merge.
|
|
88
|
+
if !output.ends_with(' ') && !output.ends_with('\n') {
|
|
89
|
+
output.push(' ');
|
|
90
|
+
}
|
|
85
91
|
return;
|
|
86
92
|
}
|
|
87
93
|
}
|
|
@@ -373,6 +373,66 @@ fn test_superscript_leading_whitespace() {
|
|
|
373
373
|
assert_eq!(result, "hello ^world^\n");
|
|
374
374
|
}
|
|
375
375
|
|
|
376
|
+
#[test]
|
|
377
|
+
fn test_subscript_default_passthrough() {
|
|
378
|
+
let html = "<p>H<sub>2</sub>O</p>";
|
|
379
|
+
let result = convert(html, None).unwrap();
|
|
380
|
+
assert_eq!(result, "H2O\n");
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
#[test]
|
|
384
|
+
fn test_superscript_default_passthrough() {
|
|
385
|
+
let html = "<p>x<sup>2</sup> + y<sup>3</sup></p>";
|
|
386
|
+
let result = convert(html, None).unwrap();
|
|
387
|
+
assert_eq!(result, "x2 + y3\n");
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
#[test]
|
|
391
|
+
fn test_subscript_superscript_combined_default() {
|
|
392
|
+
let html = "<p>CO<sub>2</sub><sup>*</sup></p>";
|
|
393
|
+
let result = convert(html, None).unwrap();
|
|
394
|
+
assert_eq!(result, "CO2*\n");
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
#[test]
|
|
398
|
+
fn test_subscript_html_tag_symbol() {
|
|
399
|
+
let html = "<p>H<sub>2</sub>O</p>";
|
|
400
|
+
let opts = ConversionOptions {
|
|
401
|
+
sub_symbol: "<sub>".to_string(),
|
|
402
|
+
..Default::default()
|
|
403
|
+
};
|
|
404
|
+
let result = convert(html, Some(opts)).unwrap();
|
|
405
|
+
assert_eq!(result, "H<sub>2</sub>O\n");
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
#[test]
|
|
409
|
+
fn test_adjacent_links_with_newline_separator() {
|
|
410
|
+
let html = "<p>\n<a href=\"/page1\">Link 1</a>\n<a href=\"/page2\">Link 2</a>\n</p>";
|
|
411
|
+
let result = convert(html, None).unwrap();
|
|
412
|
+
assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
#[test]
|
|
416
|
+
fn test_adjacent_links_no_whitespace() {
|
|
417
|
+
let html = "<p><a href=\"/page1\">Link 1</a><a href=\"/page2\">Link 2</a></p>";
|
|
418
|
+
let result = convert(html, None).unwrap();
|
|
419
|
+
assert_eq!(result, "[Link 1](/page1)[Link 2](/page2)\n");
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[test]
|
|
423
|
+
fn test_adjacent_links_with_space() {
|
|
424
|
+
let html = "<p><a href=\"/page1\">Link 1</a> <a href=\"/page2\">Link 2</a></p>";
|
|
425
|
+
let result = convert(html, None).unwrap();
|
|
426
|
+
assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
#[test]
|
|
430
|
+
fn test_adjacent_inline_elements_with_newline() {
|
|
431
|
+
let html = "<p><strong>bold</strong>\n<em>italic</em></p>";
|
|
432
|
+
let result = convert(html, None).unwrap();
|
|
433
|
+
assert_eq!(result, "**bold** *italic*\n");
|
|
434
|
+
}
|
|
435
|
+
|
|
376
436
|
#[test]
|
|
377
437
|
fn test_autolink() {
|
|
378
438
|
let html = "<p><a href=\"https://example.com\">https://example.com</a></p>";
|
data/spec/visitor_spec.rb
CHANGED
|
@@ -35,7 +35,7 @@ RSpec.describe HtmlToMarkdown do
|
|
|
35
35
|
visit_definition_list_end visit_form visit_input visit_button visit_audio visit_video
|
|
36
36
|
visit_iframe visit_details visit_summary visit_figure_start visit_figcaption
|
|
37
37
|
visit_figure_end
|
|
38
|
-
].
|
|
38
|
+
].to_h { |name| [name.to_sym, { type: :continue }] }
|
|
39
39
|
end
|
|
40
40
|
|
|
41
41
|
def create_visitor(**overrides)
|