html-to-markdown 2.27.1 → 2.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 41d0b097b3f46c377ddadf6da05274a283efeddf1142795a1766e908d7c78290
4
- data.tar.gz: 2e6585fb07a8e8cf3fc1f474cbadf6b2a3135f00ef4707029d3e35dce02726c5
3
+ metadata.gz: 4eaa23699bd7eae0f731da14c7c8a651f65734ae05d7f0e62b855ae885ee8bcb
4
+ data.tar.gz: 75e5050d21661d008d8ca8ee9d912666305b198f6ae55231c8b0394dab3a46a4
5
5
  SHA512:
6
- metadata.gz: df64f0ec15405f15043aa6594aa0560885dfa9c957fcf40ec5b4de35457f06442efd0cd3bb868d06a2a53c67a3a096719ce026e260173114857f94ab5c6249cb
7
- data.tar.gz: cbaa15dee930c2940b9aaf4f768d63cd95373cc4aea4c093d58a049e6612c3e533645cdbeaf9a64aca0571ad4a2995873ecb1b98a3034de83d5771439673989c
6
+ metadata.gz: d4bf930d4ddbaff7a94d6613f0b4f8a329d9cf9b49ed40c93bd8ca81499b2c6e4e85226d11abf91c83aa04689d3a711b003c24c886e961253de54a1c687a3acf
7
+ data.tar.gz: 7a9f8057810a314b6f7b363427dd8b6eff21cf6501b5899f7ea5a6b8d48710df28f8230aad58c073cbc68ad5c1969e8c7c4b4d701df17af7c20bc00141eedd56
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.27.1)
4
+ html-to-markdown (2.27.2)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -172,7 +172,7 @@ CHECKSUMS
172
172
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
173
173
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
174
174
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
175
- html-to-markdown (2.27.1)
175
+ html-to-markdown (2.27.2)
176
176
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
177
177
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
178
178
  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.27.1"
3
+ version ="2.27.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.27.1'
4
+ VERSION = '2.27.2'
5
5
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.27.1"
3
+ version = "2.27.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -4,9 +4,23 @@
4
4
  //! visible text content with structural whitespace, bypassing the full
5
5
  //! Markdown/Djot conversion pipeline.
6
6
 
7
+ use std::fmt::Write;
8
+
7
9
  use crate::options::ConversionOptions;
8
10
  use crate::text;
9
11
 
12
+ /// Tracks list context for proper marker emission on `<li>` elements.
13
+ #[derive(Clone, Debug)]
14
+ enum ListContext {
15
+ /// Not inside any list.
16
+ None,
17
+ /// Inside `<ul>` — each `<li>` gets a `- ` prefix.
18
+ Unordered,
19
+ /// Inside `<ol>` — each `<li>` gets a sequential `N. ` prefix.
20
+ /// The `next_index` is incremented after each `<li>`.
21
+ Ordered { next_index: u32 },
22
+ }
23
+
10
24
  /// Tags whose content should be skipped entirely.
11
25
  const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
12
26
 
@@ -49,9 +63,10 @@ const BLOCK_TAGS: &[&str] = &[
49
63
  /// - Inline elements are recursed without markers
50
64
  pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
51
65
  let mut buf = String::with_capacity(1024);
66
+ let mut list_ctx = ListContext::None;
52
67
 
53
68
  for child_handle in dom.children() {
54
- walk_plain(child_handle, parser, &mut buf, options, false);
69
+ walk_plain(child_handle, parser, &mut buf, options, false, &mut list_ctx);
55
70
  }
56
71
 
57
72
  post_process(&mut buf);
@@ -65,6 +80,7 @@ fn walk_plain(
65
80
  buf: &mut String,
66
81
  options: &ConversionOptions,
67
82
  in_pre: bool,
83
+ list_ctx: &mut ListContext,
68
84
  ) {
69
85
  let Some(node) = node_handle.get(parser) else {
70
86
  return;
@@ -105,7 +121,7 @@ fn walk_plain(
105
121
  }
106
122
  "pre" => {
107
123
  ensure_blank_line(buf);
108
- walk_children(tag, parser, buf, options, true);
124
+ walk_children(tag, parser, buf, options, true, list_ctx);
109
125
  ensure_blank_line(buf);
110
126
  }
111
127
  "img" => {
@@ -123,19 +139,50 @@ fn walk_plain(
123
139
  walk_table(tag, parser, buf, options);
124
140
  ensure_blank_line(buf);
125
141
  }
142
+ "ul" => {
143
+ ensure_newline(buf);
144
+ let mut child_ctx = ListContext::Unordered;
145
+ walk_children(tag, parser, buf, options, false, &mut child_ctx);
146
+ ensure_newline(buf);
147
+ }
148
+ "ol" => {
149
+ let start = tag
150
+ .attributes()
151
+ .get("start")
152
+ .flatten()
153
+ .and_then(|v| v.as_utf8_str().parse::<u32>().ok())
154
+ .unwrap_or(1);
155
+ ensure_newline(buf);
156
+ let mut child_ctx = ListContext::Ordered { next_index: start };
157
+ walk_children(tag, parser, buf, options, false, &mut child_ctx);
158
+ ensure_newline(buf);
159
+ }
126
160
  "li" => {
127
161
  ensure_newline(buf);
128
- walk_children(tag, parser, buf, options, false);
162
+ match list_ctx {
163
+ ListContext::Unordered => {
164
+ buf.push_str("- ");
165
+ }
166
+ ListContext::Ordered { next_index } => {
167
+ let _ = write!(buf, "{}. ", next_index);
168
+ *next_index += 1;
169
+ }
170
+ ListContext::None => {
171
+ // <li> outside a list — emit with bullet as fallback
172
+ buf.push_str("- ");
173
+ }
174
+ }
175
+ walk_children(tag, parser, buf, options, false, list_ctx);
129
176
  ensure_newline(buf);
130
177
  }
131
178
  _ if BLOCK_TAGS.contains(&tag_str) => {
132
179
  ensure_blank_line(buf);
133
- walk_children(tag, parser, buf, options, in_pre);
180
+ walk_children(tag, parser, buf, options, in_pre, list_ctx);
134
181
  ensure_blank_line(buf);
135
182
  }
136
183
  _ => {
137
- // Inline elements and structural containers (html, body, ul, ol, etc.)
138
- walk_children(tag, parser, buf, options, in_pre);
184
+ // Inline elements and structural containers (html, body, etc.)
185
+ walk_children(tag, parser, buf, options, in_pre, list_ctx);
139
186
  }
140
187
  }
141
188
  }
@@ -144,11 +191,18 @@ fn walk_plain(
144
191
  }
145
192
 
146
193
  /// Walk all children of a tag.
147
- fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
194
+ fn walk_children(
195
+ tag: &tl::HTMLTag,
196
+ parser: &tl::Parser,
197
+ buf: &mut String,
198
+ options: &ConversionOptions,
199
+ in_pre: bool,
200
+ list_ctx: &mut ListContext,
201
+ ) {
148
202
  let children = tag.children();
149
203
  let top = children.top();
150
204
  for child in top.iter() {
151
- walk_plain(child, parser, buf, options, in_pre);
205
+ walk_plain(child, parser, buf, options, in_pre, list_ctx);
152
206
  }
153
207
  }
154
208
 
@@ -185,7 +239,8 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, op
185
239
  }
186
240
  let mut cell_buf = String::new();
187
241
  if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
188
- walk_children(cell_tag, parser, &mut cell_buf, options, false);
242
+ let mut cell_list_ctx = ListContext::None;
243
+ walk_children(cell_tag, parser, &mut cell_buf, options, false, &mut cell_list_ctx);
189
244
  }
190
245
  buf.push_str(cell_buf.trim());
191
246
  }
@@ -75,12 +75,7 @@ fn test_plain_blockquote_no_prefix() {
75
75
  fn test_plain_list_items_on_separate_lines() {
76
76
  let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
77
77
  let result = convert(html, Some(plain_options())).unwrap();
78
- assert!(result.contains("First"));
79
- assert!(result.contains("Second"));
80
- assert!(result.contains("Third"));
81
- // Items should be on separate lines
82
- let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
83
- assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
78
+ assert_eq!(result, "- First\n- Second\n- Third\n");
84
79
  }
85
80
 
86
81
  #[test]
@@ -212,3 +207,61 @@ fn test_plain_pre_preserves_whitespace() {
212
207
  "Pre blocks should preserve whitespace, got: {result}"
213
208
  );
214
209
  }
210
+
211
+ #[test]
212
+ fn test_plain_unordered_list_markers() {
213
+ let html = "<ul><li>Alpha</li><li>Beta</li><li>Gamma</li></ul>";
214
+ let result = convert(html, Some(plain_options())).unwrap();
215
+ assert_eq!(result, "- Alpha\n- Beta\n- Gamma\n");
216
+ }
217
+
218
+ #[test]
219
+ fn test_plain_ordered_list_markers() {
220
+ let html = "<ol><li>First</li><li>Second</li><li>Third</li></ol>";
221
+ let result = convert(html, Some(plain_options())).unwrap();
222
+ assert_eq!(result, "1. First\n2. Second\n3. Third\n");
223
+ }
224
+
225
+ #[test]
226
+ fn test_plain_ordered_list_custom_start() {
227
+ let html = r#"<ol start="42"><li>First item starting at 42</li><li>Second item</li></ol>"#;
228
+ let result = convert(html, Some(plain_options())).unwrap();
229
+ assert_eq!(result, "42. First item starting at 42\n43. Second item\n");
230
+ }
231
+
232
+ #[test]
233
+ fn test_plain_nested_lists() {
234
+ let html = "<ul><li>Outer 1<ul><li>Inner A</li><li>Inner B</li></ul></li><li>Outer 2</li></ul>";
235
+ let result = convert(html, Some(plain_options())).unwrap();
236
+ // The outer items should have `- ` prefix and inner items should also have `- ` prefix
237
+ assert!(
238
+ result.contains("- Outer 1"),
239
+ "Expected '- Outer 1' in output, got: {result}"
240
+ );
241
+ assert!(
242
+ result.contains("- Inner A"),
243
+ "Expected '- Inner A' in output, got: {result}"
244
+ );
245
+ assert!(
246
+ result.contains("- Inner B"),
247
+ "Expected '- Inner B' in output, got: {result}"
248
+ );
249
+ assert!(
250
+ result.contains("- Outer 2"),
251
+ "Expected '- Outer 2' in output, got: {result}"
252
+ );
253
+ }
254
+
255
+ #[test]
256
+ fn test_plain_ordered_list_inside_unordered() {
257
+ let html = "<ul><li>Bullet<ol><li>Numbered</li></ol></li></ul>";
258
+ let result = convert(html, Some(plain_options())).unwrap();
259
+ assert!(
260
+ result.contains("- Bullet"),
261
+ "Expected '- Bullet' in output, got: {result}"
262
+ );
263
+ assert!(
264
+ result.contains("1. Numbered"),
265
+ "Expected '1. Numbered' in output, got: {result}"
266
+ );
267
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.27.1
4
+ version: 2.27.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-01 00:00:00.000000000 Z
11
+ date: 2026-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys