html-to-markdown 3.0.1 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +10 -6
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.lock +12 -20
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +14 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +4 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +6 -0
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +14 -0
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +12 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +2 -1
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +2 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e2761dc167e2c7f7e0e27da4367660d7dc4d18f853b5b5add976019434a37de0
|
|
4
|
+
data.tar.gz: 8d6822eb08fc782524c4ec35446c48ff017bd2aa640f2515e79c05e4d89508b7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 93b26fafdae4c4beca9fc6134a3ba2948369210900c8145e1ad72b1714db86e453523361e39308b8d78d5b94d8dc83e33259c2ae2eca727c1b20f3c1629b81c7
|
|
7
|
+
data.tar.gz: 69d16a9ff3a3d67a3cd267d9c0ff7ed48b9f3736e01fa87f5ec7ae2dd1b79263dea40d9e54392ab50546d12ccfd0c9486b3a728e45344009769a1f1fd2a65bd2
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (3.0.
|
|
4
|
+
html-to-markdown (3.0.2)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -29,6 +29,7 @@ GEM
|
|
|
29
29
|
diff-lcs (1.6.2)
|
|
30
30
|
drb (2.2.3)
|
|
31
31
|
ffi (1.17.4-arm64-darwin)
|
|
32
|
+
ffi (1.17.4-x86_64-linux-gnu)
|
|
32
33
|
fileutils (1.8.0)
|
|
33
34
|
i18n (1.14.8)
|
|
34
35
|
concurrent-ruby (~> 1.0)
|
|
@@ -40,7 +41,7 @@ GEM
|
|
|
40
41
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
41
42
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
42
43
|
logger (1.7.0)
|
|
43
|
-
minitest (6.0.
|
|
44
|
+
minitest (6.0.3)
|
|
44
45
|
drb (~> 2.0)
|
|
45
46
|
prism (~> 1.5)
|
|
46
47
|
mutex_m (0.3.0)
|
|
@@ -58,7 +59,8 @@ GEM
|
|
|
58
59
|
rb-fsevent (0.11.2)
|
|
59
60
|
rb-inotify (0.11.1)
|
|
60
61
|
ffi (~> 1.0)
|
|
61
|
-
rb_sys (0.9.
|
|
62
|
+
rb_sys (0.9.125)
|
|
63
|
+
json (>= 2)
|
|
62
64
|
rake-compiler-dock (= 1.11.0)
|
|
63
65
|
rbs (3.10.4)
|
|
64
66
|
logger
|
|
@@ -126,6 +128,7 @@ GEM
|
|
|
126
128
|
|
|
127
129
|
PLATFORMS
|
|
128
130
|
arm64-darwin
|
|
131
|
+
x86_64-linux
|
|
129
132
|
|
|
130
133
|
DEPENDENCIES
|
|
131
134
|
html-to-markdown!
|
|
@@ -148,15 +151,16 @@ CHECKSUMS
|
|
|
148
151
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
149
152
|
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
150
153
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
154
|
+
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
151
155
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
152
|
-
html-to-markdown (3.0.
|
|
156
|
+
html-to-markdown (3.0.2)
|
|
153
157
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
154
158
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
155
159
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
156
160
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
157
161
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
158
162
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
159
|
-
minitest (6.0.
|
|
163
|
+
minitest (6.0.3) sha256=88ac8a1de36c00692420e7cb3cc11a0773bbcb126aee1c249f320160a7d11411
|
|
160
164
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
161
165
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
162
166
|
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
|
@@ -168,7 +172,7 @@ CHECKSUMS
|
|
|
168
172
|
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
169
173
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
170
174
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
171
|
-
rb_sys (0.9.
|
|
175
|
+
rb_sys (0.9.125) sha256=14efd4e07eaf7c07edb1bab548d2a4767869a47a8821fc5ea52d9bf982ef00a8
|
|
172
176
|
rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
|
|
173
177
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
174
178
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.2" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -59,16 +59,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
|
|
59
59
|
|
|
60
60
|
[[package]]
|
|
61
61
|
name = "bindgen"
|
|
62
|
-
version = "0.
|
|
62
|
+
version = "0.72.1"
|
|
63
63
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
64
|
-
checksum = "
|
|
64
|
+
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
|
|
65
65
|
dependencies = [
|
|
66
66
|
"bitflags",
|
|
67
67
|
"cexpr",
|
|
68
68
|
"clang-sys",
|
|
69
69
|
"itertools",
|
|
70
|
-
"lazy_static",
|
|
71
|
-
"lazycell",
|
|
72
70
|
"proc-macro2",
|
|
73
71
|
"quote",
|
|
74
72
|
"regex",
|
|
@@ -260,7 +258,7 @@ dependencies = [
|
|
|
260
258
|
|
|
261
259
|
[[package]]
|
|
262
260
|
name = "html-to-markdown-rb"
|
|
263
|
-
version = "3.0.
|
|
261
|
+
version = "3.0.1"
|
|
264
262
|
dependencies = [
|
|
265
263
|
"html-to-markdown-rs",
|
|
266
264
|
"magnus",
|
|
@@ -269,7 +267,7 @@ dependencies = [
|
|
|
269
267
|
|
|
270
268
|
[[package]]
|
|
271
269
|
name = "html-to-markdown-rs"
|
|
272
|
-
version = "3.0.
|
|
270
|
+
version = "3.0.1"
|
|
273
271
|
dependencies = [
|
|
274
272
|
"ahash",
|
|
275
273
|
"astral-tl",
|
|
@@ -326,9 +324,9 @@ dependencies = [
|
|
|
326
324
|
|
|
327
325
|
[[package]]
|
|
328
326
|
name = "itertools"
|
|
329
|
-
version = "0.
|
|
327
|
+
version = "0.13.0"
|
|
330
328
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
331
|
-
checksum = "
|
|
329
|
+
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
|
|
332
330
|
dependencies = [
|
|
333
331
|
"either",
|
|
334
332
|
]
|
|
@@ -345,12 +343,6 @@ version = "1.5.0"
|
|
|
345
343
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
346
344
|
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
347
345
|
|
|
348
|
-
[[package]]
|
|
349
|
-
name = "lazycell"
|
|
350
|
-
version = "1.3.0"
|
|
351
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
352
|
-
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
353
|
-
|
|
354
346
|
[[package]]
|
|
355
347
|
name = "libc"
|
|
356
348
|
version = "0.2.183"
|
|
@@ -609,18 +601,18 @@ dependencies = [
|
|
|
609
601
|
|
|
610
602
|
[[package]]
|
|
611
603
|
name = "rb-sys"
|
|
612
|
-
version = "0.9.
|
|
604
|
+
version = "0.9.125"
|
|
613
605
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
614
|
-
checksum = "
|
|
606
|
+
checksum = "85b37650fabd8ba515910a0dc089dcb6348eb3c35fbf91698cb226435be2babc"
|
|
615
607
|
dependencies = [
|
|
616
608
|
"rb-sys-build",
|
|
617
609
|
]
|
|
618
610
|
|
|
619
611
|
[[package]]
|
|
620
612
|
name = "rb-sys-build"
|
|
621
|
-
version = "0.9.
|
|
613
|
+
version = "0.9.125"
|
|
622
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
623
|
-
checksum = "
|
|
615
|
+
checksum = "c73b806faa66006e491458b48a78725621c1ac5a2a6efe2614c90711a7780b80"
|
|
624
616
|
dependencies = [
|
|
625
617
|
"bindgen",
|
|
626
618
|
"lazy_static",
|
|
@@ -677,9 +669,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
|
|
677
669
|
|
|
678
670
|
[[package]]
|
|
679
671
|
name = "rustc-hash"
|
|
680
|
-
version = "
|
|
672
|
+
version = "2.1.2"
|
|
681
673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
682
|
-
checksum = "
|
|
674
|
+
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
|
683
675
|
|
|
684
676
|
[[package]]
|
|
685
677
|
name = "scopeguard"
|
data/vendor/Cargo.toml
CHANGED
|
@@ -128,16 +128,20 @@ pub(crate) fn handle(
|
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
// Notify the structure collector if present.
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
131
|
+
// Skip headings inside table cells — they are part of the table content,
|
|
132
|
+
// not standalone structural headings.
|
|
133
|
+
if !ctx.in_table_cell {
|
|
134
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
135
|
+
if let Some(node) = node_handle.get(parser) {
|
|
136
|
+
if let tl::Node::Tag(tag) = node {
|
|
137
|
+
let id = tag
|
|
138
|
+
.attributes()
|
|
139
|
+
.get("id")
|
|
140
|
+
.flatten()
|
|
141
|
+
.map(|v| v.as_utf8_str().to_string());
|
|
142
|
+
sc.borrow_mut()
|
|
143
|
+
.push_heading(level as u8, normalized.as_ref(), id.as_deref());
|
|
144
|
+
}
|
|
141
145
|
}
|
|
142
146
|
}
|
|
143
147
|
}
|
|
@@ -301,6 +301,10 @@ pub fn handle_pre(
|
|
|
301
301
|
{
|
|
302
302
|
format_code_block(&processed_content, language.as_deref(), output, options, ctx);
|
|
303
303
|
}
|
|
304
|
+
|
|
305
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
306
|
+
sc.borrow_mut().push_code(&processed_content, language.as_deref());
|
|
307
|
+
}
|
|
304
308
|
}
|
|
305
309
|
}
|
|
306
310
|
|
|
@@ -192,6 +192,12 @@ pub fn handle_img(
|
|
|
192
192
|
}
|
|
193
193
|
}
|
|
194
194
|
}
|
|
195
|
+
|
|
196
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
197
|
+
let src_opt = if src.is_empty() { None } else { Some(src.as_ref()) };
|
|
198
|
+
let alt_opt = if alt.is_empty() { None } else { Some(alt.as_ref()) };
|
|
199
|
+
sc.borrow_mut().push_image(src_opt, alt_opt);
|
|
200
|
+
}
|
|
195
201
|
}
|
|
196
202
|
|
|
197
203
|
/// Format an image as Markdown syntax.
|
|
@@ -204,6 +204,8 @@ pub(crate) fn handle_li(
|
|
|
204
204
|
}
|
|
205
205
|
}
|
|
206
206
|
|
|
207
|
+
let item_start_pos = output.len();
|
|
208
|
+
|
|
207
209
|
let children = tag.children();
|
|
208
210
|
{
|
|
209
211
|
for child_handle in children.top().iter() {
|
|
@@ -213,6 +215,18 @@ pub(crate) fn handle_li(
|
|
|
213
215
|
|
|
214
216
|
trim_trailing_whitespace(output);
|
|
215
217
|
|
|
218
|
+
if !ctx.in_table_cell {
|
|
219
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
220
|
+
if item_start_pos <= output.len() && output.is_char_boundary(item_start_pos) {
|
|
221
|
+
let rendered = &output[item_start_pos..];
|
|
222
|
+
let content = rendered.trim();
|
|
223
|
+
if !content.is_empty() {
|
|
224
|
+
sc.borrow_mut().push_list_item(content);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
216
230
|
#[cfg(feature = "visitor")]
|
|
217
231
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
218
232
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
@@ -107,6 +107,12 @@ pub(crate) fn handle_ol(
|
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
if !ctx.in_table_cell {
|
|
111
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
112
|
+
sc.borrow_mut().push_list_start(true);
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
110
116
|
process_list_children(
|
|
111
117
|
*node_handle,
|
|
112
118
|
parser,
|
|
@@ -121,6 +127,12 @@ pub(crate) fn handle_ol(
|
|
|
121
127
|
dom_ctx,
|
|
122
128
|
);
|
|
123
129
|
|
|
130
|
+
if !ctx.in_table_cell {
|
|
131
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
132
|
+
sc.borrow_mut().push_list_end();
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
124
136
|
add_nested_list_trailing_separator(output, ctx);
|
|
125
137
|
|
|
126
138
|
#[cfg(feature = "visitor")]
|
|
@@ -101,6 +101,12 @@ pub(crate) fn handle_ul(
|
|
|
101
101
|
}
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
+
if !ctx.in_table_cell {
|
|
105
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
106
|
+
sc.borrow_mut().push_list_start(false);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
104
110
|
process_list_children(
|
|
105
111
|
*node_handle,
|
|
106
112
|
parser,
|
|
@@ -115,6 +121,12 @@ pub(crate) fn handle_ul(
|
|
|
115
121
|
dom_ctx,
|
|
116
122
|
);
|
|
117
123
|
|
|
124
|
+
if !ctx.in_table_cell {
|
|
125
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
126
|
+
sc.borrow_mut().push_list_end();
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
118
130
|
add_nested_list_trailing_separator(output, ctx);
|
|
119
131
|
|
|
120
132
|
#[cfg(feature = "visitor")]
|
|
@@ -226,7 +226,8 @@ fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
|
|
|
226
226
|
let mut hasher = DefaultHasher::new();
|
|
227
227
|
node_type.hash(&mut hasher);
|
|
228
228
|
// Only hash a prefix of the text to keep cost bounded.
|
|
229
|
-
text
|
|
229
|
+
let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
|
|
230
|
+
text[..end].hash(&mut hasher);
|
|
230
231
|
index.hash(&mut hasher);
|
|
231
232
|
let digest = hasher.finish();
|
|
232
233
|
format!("{node_type}-{digest:016x}")
|
|
@@ -347,7 +347,8 @@ impl StructureCollector {
|
|
|
347
347
|
|
|
348
348
|
let mut hasher = DefaultHasher::new();
|
|
349
349
|
node_type.hash(&mut hasher);
|
|
350
|
-
text
|
|
350
|
+
let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
|
|
351
|
+
text[..end].hash(&mut hasher);
|
|
351
352
|
index.hash(&mut hasher);
|
|
352
353
|
let digest = hasher.finish();
|
|
353
354
|
format!("{node_type}-{digest:016x}")
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.0.
|
|
4
|
+
version: 3.0.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|