html-to-markdown 2.27.1 → 2.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -9
- data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs +59 -6
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4eaa23699bd7eae0f731da14c7c8a651f65734ae05d7f0e62b855ae885ee8bcb
|
|
4
|
+
data.tar.gz: 75e5050d21661d008d8ca8ee9d912666305b198f6ae55231c8b0394dab3a46a4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d4bf930d4ddbaff7a94d6613f0b4f8a329d9cf9b49ed40c93bd8ca81499b2c6e4e85226d11abf91c83aa04689d3a711b003c24c886e961253de54a1c687a3acf
|
|
7
|
+
data.tar.gz: 7a9f8057810a314b6f7b363427dd8b6eff21cf6501b5899f7ea5a6b8d48710df28f8230aad58c073cbc68ad5c1969e8c7c4b4d701df17af7c20bc00141eedd56
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.27.1)
|
|
4
|
+
html-to-markdown (2.27.2)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -172,7 +172,7 @@ CHECKSUMS
|
|
|
172
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
173
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
174
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
175
|
-
html-to-markdown (2.27.1)
|
|
175
|
+
html-to-markdown (2.27.2)
|
|
176
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
177
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
178
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
@@ -4,9 +4,23 @@
|
|
|
4
4
|
//! visible text content with structural whitespace, bypassing the full
|
|
5
5
|
//! Markdown/Djot conversion pipeline.
|
|
6
6
|
|
|
7
|
+
use std::fmt::Write;
|
|
8
|
+
|
|
7
9
|
use crate::options::ConversionOptions;
|
|
8
10
|
use crate::text;
|
|
9
11
|
|
|
12
|
+
/// Tracks list context for proper marker emission on `<li>` elements.
|
|
13
|
+
#[derive(Clone, Debug)]
|
|
14
|
+
enum ListContext {
|
|
15
|
+
/// Not inside any list.
|
|
16
|
+
None,
|
|
17
|
+
/// Inside `<ul>` — each `<li>` gets a `- ` prefix.
|
|
18
|
+
Unordered,
|
|
19
|
+
/// Inside `<ol>` — each `<li>` gets a sequential `N. ` prefix.
|
|
20
|
+
/// The `next_index` is incremented after each `<li>`.
|
|
21
|
+
Ordered { next_index: u32 },
|
|
22
|
+
}
|
|
23
|
+
|
|
10
24
|
/// Tags whose content should be skipped entirely.
|
|
11
25
|
const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
|
|
12
26
|
|
|
@@ -49,9 +63,10 @@ const BLOCK_TAGS: &[&str] = &[
|
|
|
49
63
|
/// - Inline elements are recursed without markers
|
|
50
64
|
pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
|
|
51
65
|
let mut buf = String::with_capacity(1024);
|
|
66
|
+
let mut list_ctx = ListContext::None;
|
|
52
67
|
|
|
53
68
|
for child_handle in dom.children() {
|
|
54
|
-
walk_plain(child_handle, parser, &mut buf, options, false);
|
|
69
|
+
walk_plain(child_handle, parser, &mut buf, options, false, &mut list_ctx);
|
|
55
70
|
}
|
|
56
71
|
|
|
57
72
|
post_process(&mut buf);
|
|
@@ -65,6 +80,7 @@ fn walk_plain(
|
|
|
65
80
|
buf: &mut String,
|
|
66
81
|
options: &ConversionOptions,
|
|
67
82
|
in_pre: bool,
|
|
83
|
+
list_ctx: &mut ListContext,
|
|
68
84
|
) {
|
|
69
85
|
let Some(node) = node_handle.get(parser) else {
|
|
70
86
|
return;
|
|
@@ -105,7 +121,7 @@ fn walk_plain(
|
|
|
105
121
|
}
|
|
106
122
|
"pre" => {
|
|
107
123
|
ensure_blank_line(buf);
|
|
108
|
-
walk_children(tag, parser, buf, options, true);
|
|
124
|
+
walk_children(tag, parser, buf, options, true, list_ctx);
|
|
109
125
|
ensure_blank_line(buf);
|
|
110
126
|
}
|
|
111
127
|
"img" => {
|
|
@@ -123,19 +139,50 @@ fn walk_plain(
|
|
|
123
139
|
walk_table(tag, parser, buf, options);
|
|
124
140
|
ensure_blank_line(buf);
|
|
125
141
|
}
|
|
142
|
+
"ul" => {
|
|
143
|
+
ensure_newline(buf);
|
|
144
|
+
let mut child_ctx = ListContext::Unordered;
|
|
145
|
+
walk_children(tag, parser, buf, options, false, &mut child_ctx);
|
|
146
|
+
ensure_newline(buf);
|
|
147
|
+
}
|
|
148
|
+
"ol" => {
|
|
149
|
+
let start = tag
|
|
150
|
+
.attributes()
|
|
151
|
+
.get("start")
|
|
152
|
+
.flatten()
|
|
153
|
+
.and_then(|v| v.as_utf8_str().parse::<u32>().ok())
|
|
154
|
+
.unwrap_or(1);
|
|
155
|
+
ensure_newline(buf);
|
|
156
|
+
let mut child_ctx = ListContext::Ordered { next_index: start };
|
|
157
|
+
walk_children(tag, parser, buf, options, false, &mut child_ctx);
|
|
158
|
+
ensure_newline(buf);
|
|
159
|
+
}
|
|
126
160
|
"li" => {
|
|
127
161
|
ensure_newline(buf);
|
|
128
|
-
|
|
162
|
+
match list_ctx {
|
|
163
|
+
ListContext::Unordered => {
|
|
164
|
+
buf.push_str("- ");
|
|
165
|
+
}
|
|
166
|
+
ListContext::Ordered { next_index } => {
|
|
167
|
+
let _ = write!(buf, "{}. ", next_index);
|
|
168
|
+
*next_index += 1;
|
|
169
|
+
}
|
|
170
|
+
ListContext::None => {
|
|
171
|
+
// <li> outside a list — emit with bullet as fallback
|
|
172
|
+
buf.push_str("- ");
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
walk_children(tag, parser, buf, options, false, list_ctx);
|
|
129
176
|
ensure_newline(buf);
|
|
130
177
|
}
|
|
131
178
|
_ if BLOCK_TAGS.contains(&tag_str) => {
|
|
132
179
|
ensure_blank_line(buf);
|
|
133
|
-
walk_children(tag, parser, buf, options, in_pre);
|
|
180
|
+
walk_children(tag, parser, buf, options, in_pre, list_ctx);
|
|
134
181
|
ensure_blank_line(buf);
|
|
135
182
|
}
|
|
136
183
|
_ => {
|
|
137
|
-
// Inline elements and structural containers (html, body,
|
|
138
|
-
walk_children(tag, parser, buf, options, in_pre);
|
|
184
|
+
// Inline elements and structural containers (html, body, etc.)
|
|
185
|
+
walk_children(tag, parser, buf, options, in_pre, list_ctx);
|
|
139
186
|
}
|
|
140
187
|
}
|
|
141
188
|
}
|
|
@@ -144,11 +191,18 @@ fn walk_plain(
|
|
|
144
191
|
}
|
|
145
192
|
|
|
146
193
|
/// Walk all children of a tag.
|
|
147
|
-
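fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {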
|
|
194
|
+
fn walk_children(
|
|
195
|
+
tag: &tl::HTMLTag,
|
|
196
|
+
parser: &tl::Parser,
|
|
197
|
+
buf: &mut String,
|
|
198
|
+
options: &ConversionOptions,
|
|
199
|
+
in_pre: bool,
|
|
200
|
+
list_ctx: &mut ListContext,
|
|
201
|
+
) {
|
|
148
202
|
let children = tag.children();
|
|
149
203
|
let top = children.top();
|
|
150
204
|
for child in top.iter() {
|
|
151
|
-
walk_plain(child, parser, buf, options, in_pre);
|
|
205
|
+
walk_plain(child, parser, buf, options, in_pre, list_ctx);
|
|
152
206
|
}
|
|
153
207
|
}
|
|
154
208
|
|
|
@@ -185,7 +239,8 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, op
|
|
|
185
239
|
}
|
|
186
240
|
let mut cell_buf = String::new();
|
|
187
241
|
if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
|
|
188
|
-
|
|
242
|
+
let mut cell_list_ctx = ListContext::None;
|
|
243
|
+
walk_children(cell_tag, parser, &mut cell_buf, options, false, &mut cell_list_ctx);
|
|
189
244
|
}
|
|
190
245
|
buf.push_str(cell_buf.trim());
|
|
191
246
|
}
|
|
@@ -75,12 +75,7 @@ fn test_plain_blockquote_no_prefix() {
|
|
|
75
75
|
fn test_plain_list_items_on_separate_lines() {
|
|
76
76
|
let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
|
|
77
77
|
let result = convert(html, Some(plain_options())).unwrap();
|
|
78
|
-
|
|
79
|
-
assert!(result.contains("Second"));
|
|
80
|
-
assert!(result.contains("Third"));
|
|
81
|
-
// Items should be on separate lines
|
|
82
|
-
let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
|
|
83
|
-
assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
|
|
78
|
+
assert_eq!(result, "- First\n- Second\n- Third\n");
|
|
84
79
|
}
|
|
85
80
|
|
|
86
81
|
#[test]
|
|
@@ -212,3 +207,61 @@ fn test_plain_pre_preserves_whitespace() {
|
|
|
212
207
|
"Pre blocks should preserve whitespace, got: {result}"
|
|
213
208
|
);
|
|
214
209
|
}
|
|
210
|
+
|
|
211
|
+
#[test]
|
|
212
|
+
fn test_plain_unordered_list_markers() {
|
|
213
|
+
let html = "<ul><li>Alpha</li><li>Beta</li><li>Gamma</li></ul>";
|
|
214
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
215
|
+
assert_eq!(result, "- Alpha\n- Beta\n- Gamma\n");
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
#[test]
|
|
219
|
+
fn test_plain_ordered_list_markers() {
|
|
220
|
+
let html = "<ol><li>First</li><li>Second</li><li>Third</li></ol>";
|
|
221
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
222
|
+
assert_eq!(result, "1. First\n2. Second\n3. Third\n");
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
#[test]
|
|
226
|
+
fn test_plain_ordered_list_custom_start() {
|
|
227
|
+
let html = r#"<ol start="42"><li>First item starting at 42</li><li>Second item</li></ol>"#;
|
|
228
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
229
|
+
assert_eq!(result, "42. First item starting at 42\n43. Second item\n");
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
#[test]
|
|
233
|
+
fn test_plain_nested_lists() {
|
|
234
|
+
let html = "<ul><li>Outer 1<ul><li>Inner A</li><li>Inner B</li></ul></li><li>Outer 2</li></ul>";
|
|
235
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
236
|
+
// The outer items should have `- ` prefix and inner items should also have `- ` prefix
|
|
237
|
+
assert!(
|
|
238
|
+
result.contains("- Outer 1"),
|
|
239
|
+
"Expected '- Outer 1' in output, got: {result}"
|
|
240
|
+
);
|
|
241
|
+
assert!(
|
|
242
|
+
result.contains("- Inner A"),
|
|
243
|
+
"Expected '- Inner A' in output, got: {result}"
|
|
244
|
+
);
|
|
245
|
+
assert!(
|
|
246
|
+
result.contains("- Inner B"),
|
|
247
|
+
"Expected '- Inner B' in output, got: {result}"
|
|
248
|
+
);
|
|
249
|
+
assert!(
|
|
250
|
+
result.contains("- Outer 2"),
|
|
251
|
+
"Expected '- Outer 2' in output, got: {result}"
|
|
252
|
+
);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
#[test]
|
|
256
|
+
fn test_plain_ordered_list_inside_unordered() {
|
|
257
|
+
let html = "<ul><li>Bullet<ol><li>Numbered</li></ol></li></ul>";
|
|
258
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
259
|
+
assert!(
|
|
260
|
+
result.contains("- Bullet"),
|
|
261
|
+
"Expected '- Bullet' in output, got: {result}"
|
|
262
|
+
);
|
|
263
|
+
assert!(
|
|
264
|
+
result.contains("1. Numbered"),
|
|
265
|
+
"Expected '1. Numbered' in output, got: {result}"
|
|
266
|
+
);
|
|
267
|
+
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.27.1
|
|
4
|
+
version: 2.27.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|