html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
#![cfg(feature = "visitor")]
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, NodeType, VisitResult};
|
|
11
|
-
use html_to_markdown_rs::{ConversionOptions,
|
|
11
|
+
use html_to_markdown_rs::{ConversionOptions, convert};
|
|
12
12
|
use std::cell::RefCell;
|
|
13
13
|
use std::rc::Rc;
|
|
14
14
|
|
|
@@ -99,7 +99,10 @@ fn test_custom_visitor_transforms_text() {
|
|
|
99
99
|
let html = r"<p>Hello world</p>";
|
|
100
100
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
101
101
|
|
|
102
|
-
let result =
|
|
102
|
+
let result = convert(html, None, Some(visitor))
|
|
103
|
+
.expect("conversion failed")
|
|
104
|
+
.content
|
|
105
|
+
.unwrap_or_default();
|
|
103
106
|
|
|
104
107
|
assert!(result.contains("[TEXT:"), "Should contain custom text format");
|
|
105
108
|
}
|
|
@@ -109,7 +112,10 @@ fn test_custom_visitor_transforms_links() {
|
|
|
109
112
|
let html = r#"<a href="https://example.com">Example</a>"#;
|
|
110
113
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
111
114
|
|
|
112
|
-
let result =
|
|
115
|
+
let result = convert(html, None, Some(visitor))
|
|
116
|
+
.expect("conversion failed")
|
|
117
|
+
.content
|
|
118
|
+
.unwrap_or_default();
|
|
113
119
|
|
|
114
120
|
assert!(
|
|
115
121
|
result.contains("[LINK:Example -> https://example.com]"),
|
|
@@ -122,7 +128,10 @@ fn test_custom_visitor_transforms_images() {
|
|
|
122
128
|
let html = r#"<img src="/test.png" alt="Test">"#;
|
|
123
129
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
124
130
|
|
|
125
|
-
let result =
|
|
131
|
+
let result = convert(html, None, Some(visitor))
|
|
132
|
+
.expect("conversion failed")
|
|
133
|
+
.content
|
|
134
|
+
.unwrap_or_default();
|
|
126
135
|
|
|
127
136
|
assert!(
|
|
128
137
|
result.contains("[IMAGE:Test @ /test.png]"),
|
|
@@ -135,7 +144,10 @@ fn test_custom_visitor_transforms_headings() {
|
|
|
135
144
|
let html = r"<h2>My Heading</h2>";
|
|
136
145
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
137
146
|
|
|
138
|
-
let result =
|
|
147
|
+
let result = convert(html, None, Some(visitor))
|
|
148
|
+
.expect("conversion failed")
|
|
149
|
+
.content
|
|
150
|
+
.unwrap_or_default();
|
|
139
151
|
|
|
140
152
|
assert!(
|
|
141
153
|
result.contains("[H2: My Heading]"),
|
|
@@ -151,7 +163,10 @@ fn test_skipping_visitor_removes_links() {
|
|
|
151
163
|
skip_images: false,
|
|
152
164
|
}));
|
|
153
165
|
|
|
154
|
-
let result =
|
|
166
|
+
let result = convert(html, None, Some(visitor))
|
|
167
|
+
.expect("conversion failed")
|
|
168
|
+
.content
|
|
169
|
+
.unwrap_or_default();
|
|
155
170
|
|
|
156
171
|
assert!(
|
|
157
172
|
!result.contains("example.com"),
|
|
@@ -167,7 +182,10 @@ fn test_skipping_visitor_removes_images() {
|
|
|
167
182
|
skip_images: true,
|
|
168
183
|
}));
|
|
169
184
|
|
|
170
|
-
let result =
|
|
185
|
+
let result = convert(html, None, Some(visitor))
|
|
186
|
+
.expect("conversion failed")
|
|
187
|
+
.content
|
|
188
|
+
.unwrap_or_default();
|
|
171
189
|
|
|
172
190
|
assert!(
|
|
173
191
|
!result.contains("test.png") && !result.contains("!["),
|
|
@@ -180,7 +198,10 @@ fn test_preserving_visitor_keeps_html() {
|
|
|
180
198
|
let html = r#"<a href="https://example.com" class="special">Example</a>"#;
|
|
181
199
|
let visitor = Rc::new(RefCell::new(PreservingVisitor { preserve_links: true }));
|
|
182
200
|
|
|
183
|
-
let result =
|
|
201
|
+
let result = convert(html, None, Some(visitor))
|
|
202
|
+
.expect("conversion failed")
|
|
203
|
+
.content
|
|
204
|
+
.unwrap_or_default();
|
|
184
205
|
|
|
185
206
|
assert!(
|
|
186
207
|
result.contains("<a") && result.contains("href"),
|
|
@@ -193,7 +214,7 @@ fn test_visitor_receives_node_context() {
|
|
|
193
214
|
let html = r#"<h1 id="title" class="main">Title</h1>"#;
|
|
194
215
|
let visitor = Rc::new(RefCell::new(ContextCheckingVisitor::default()));
|
|
195
216
|
|
|
196
|
-
let _result =
|
|
217
|
+
let _result = convert(html, None, Some(visitor)).expect("conversion failed");
|
|
197
218
|
}
|
|
198
219
|
|
|
199
220
|
#[test]
|
|
@@ -216,7 +237,10 @@ fn test_visitor_works_with_complex_document() {
|
|
|
216
237
|
|
|
217
238
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
218
239
|
|
|
219
|
-
let result =
|
|
240
|
+
let result = convert(html, None, Some(visitor))
|
|
241
|
+
.expect("conversion failed")
|
|
242
|
+
.content
|
|
243
|
+
.unwrap_or_default();
|
|
220
244
|
|
|
221
245
|
assert!(result.contains("[H1:"));
|
|
222
246
|
assert!(result.contains("[H2:"));
|
|
@@ -243,7 +267,10 @@ fn test_visitor_with_conversion_options() {
|
|
|
243
267
|
|
|
244
268
|
let visitor = Rc::new(RefCell::new(ContinueVisitor));
|
|
245
269
|
|
|
246
|
-
let result =
|
|
270
|
+
let result = convert(html, Some(options), Some(visitor))
|
|
271
|
+
.expect("conversion failed")
|
|
272
|
+
.content
|
|
273
|
+
.unwrap_or_default();
|
|
247
274
|
|
|
248
275
|
assert!(
|
|
249
276
|
result.contains(r"\*") || result.contains(r"\_"),
|
|
@@ -265,7 +292,10 @@ fn test_visitor_continue_result_produces_default_markdown() {
|
|
|
265
292
|
let html = r"<h1>Title</h1>";
|
|
266
293
|
let visitor = Rc::new(RefCell::new(ContinueVisitor));
|
|
267
294
|
|
|
268
|
-
let result =
|
|
295
|
+
let result = convert(html, None, Some(visitor))
|
|
296
|
+
.expect("conversion failed")
|
|
297
|
+
.content
|
|
298
|
+
.unwrap_or_default();
|
|
269
299
|
|
|
270
300
|
assert!(
|
|
271
301
|
result.contains("# Title"),
|
|
@@ -294,7 +324,10 @@ fn test_visitor_skip_vs_continue() {
|
|
|
294
324
|
let html = r#"<p><a href="/first">First</a> and <a href="/second">Second</a></p>"#;
|
|
295
325
|
let visitor = Rc::new(RefCell::new(SelectiveSkipper { skip_first_link: true }));
|
|
296
326
|
|
|
297
|
-
let result =
|
|
327
|
+
let result = convert(html, None, Some(visitor))
|
|
328
|
+
.expect("conversion failed")
|
|
329
|
+
.content
|
|
330
|
+
.unwrap_or_default();
|
|
298
331
|
|
|
299
332
|
assert!(!result.contains("/first"));
|
|
300
333
|
assert!(result.contains("/second"));
|
|
@@ -305,7 +338,10 @@ fn test_multiple_elements_of_same_type() {
|
|
|
305
338
|
let html = r"<h1>First</h1><h2>Second</h2><h3>Third</h3>";
|
|
306
339
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
307
340
|
|
|
308
|
-
let result =
|
|
341
|
+
let result = convert(html, None, Some(visitor))
|
|
342
|
+
.expect("conversion failed")
|
|
343
|
+
.content
|
|
344
|
+
.unwrap_or_default();
|
|
309
345
|
|
|
310
346
|
assert!(result.contains("[H1: First]"));
|
|
311
347
|
assert!(result.contains("[H2: Second]"));
|
|
@@ -317,7 +353,10 @@ fn test_nested_elements_invoke_visitor() {
|
|
|
317
353
|
let html = r#"<p>Text with <a href="/url">a <strong>bold</strong> link</a></p>"#;
|
|
318
354
|
let visitor = Rc::new(RefCell::new(CustomizingVisitor));
|
|
319
355
|
|
|
320
|
-
let result =
|
|
356
|
+
let result = convert(html, None, Some(visitor))
|
|
357
|
+
.expect("conversion failed")
|
|
358
|
+
.content
|
|
359
|
+
.unwrap_or_default();
|
|
321
360
|
|
|
322
361
|
assert!(result.contains("[TEXT:"));
|
|
323
362
|
assert!(result.contains("[LINK:"));
|
|
@@ -336,7 +375,7 @@ fn test_visitor_error_stops_conversion() {
|
|
|
336
375
|
|
|
337
376
|
let html = "<p>text</p>";
|
|
338
377
|
let visitor = Rc::new(RefCell::new(ErrorVisitor));
|
|
339
|
-
let result =
|
|
378
|
+
let result = convert(html, None, Some(visitor));
|
|
340
379
|
|
|
341
380
|
assert!(result.is_err(), "Should return error when visitor returns Error");
|
|
342
381
|
assert!(
|
|
@@ -359,7 +398,10 @@ fn test_visitor_code_block() {
|
|
|
359
398
|
|
|
360
399
|
let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
|
|
361
400
|
let visitor = Rc::new(RefCell::new(CodeBlockVisitor));
|
|
362
|
-
let result =
|
|
401
|
+
let result = convert(html, None, Some(visitor))
|
|
402
|
+
.expect("conversion failed")
|
|
403
|
+
.content
|
|
404
|
+
.unwrap_or_default();
|
|
363
405
|
|
|
364
406
|
assert!(
|
|
365
407
|
result.contains("[CODE_BLOCK:rust -> fn main() {}]"),
|
|
@@ -380,7 +422,10 @@ fn test_visitor_code_inline() {
|
|
|
380
422
|
|
|
381
423
|
let html = r"<p>Use <code>println!</code> macro</p>";
|
|
382
424
|
let visitor = Rc::new(RefCell::new(InlineCodeVisitor));
|
|
383
|
-
let result =
|
|
425
|
+
let result = convert(html, None, Some(visitor))
|
|
426
|
+
.expect("conversion failed")
|
|
427
|
+
.content
|
|
428
|
+
.unwrap_or_default();
|
|
384
429
|
|
|
385
430
|
assert!(
|
|
386
431
|
result.contains("[CODE:println!]"),
|
|
@@ -418,7 +463,10 @@ fn test_visitor_list_callbacks() {
|
|
|
418
463
|
|
|
419
464
|
let html = r"<ul><li>First</li><li>Second</li></ul>";
|
|
420
465
|
let visitor = Rc::new(RefCell::new(ListVisitor::default()));
|
|
421
|
-
let result =
|
|
466
|
+
let result = convert(html, None, Some(visitor))
|
|
467
|
+
.expect("conversion failed")
|
|
468
|
+
.content
|
|
469
|
+
.unwrap_or_default();
|
|
422
470
|
|
|
423
471
|
assert!(
|
|
424
472
|
result.contains("[LIST_START:UL:1]"),
|
|
@@ -462,7 +510,10 @@ fn test_visitor_table_callbacks() {
|
|
|
462
510
|
|
|
463
511
|
let html = r"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
|
|
464
512
|
let visitor = Rc::new(RefCell::new(TableVisitor::default()));
|
|
465
|
-
let result =
|
|
513
|
+
let result = convert(html, None, Some(visitor))
|
|
514
|
+
.expect("conversion failed")
|
|
515
|
+
.content
|
|
516
|
+
.unwrap_or_default();
|
|
466
517
|
|
|
467
518
|
assert!(
|
|
468
519
|
result.contains("[TABLE_START]"),
|
|
@@ -492,7 +543,10 @@ fn test_visitor_blockquote() {
|
|
|
492
543
|
|
|
493
544
|
let html = r"<blockquote>This is a quote</blockquote>";
|
|
494
545
|
let visitor = Rc::new(RefCell::new(BlockquoteVisitor));
|
|
495
|
-
let result =
|
|
546
|
+
let result = convert(html, None, Some(visitor))
|
|
547
|
+
.expect("conversion failed")
|
|
548
|
+
.content
|
|
549
|
+
.unwrap_or_default();
|
|
496
550
|
|
|
497
551
|
assert!(
|
|
498
552
|
result.contains("[QUOTE:This is a quote]"),
|
|
@@ -521,7 +575,10 @@ fn test_visitor_inline_formatting() {
|
|
|
521
575
|
|
|
522
576
|
let html = r"<p><strong>bold</strong> <em>italic</em> <del>struck</del></p>";
|
|
523
577
|
let visitor = Rc::new(RefCell::new(FormattingVisitor));
|
|
524
|
-
let result =
|
|
578
|
+
let result = convert(html, None, Some(visitor))
|
|
579
|
+
.expect("conversion failed")
|
|
580
|
+
.content
|
|
581
|
+
.unwrap_or_default();
|
|
525
582
|
|
|
526
583
|
assert!(result.contains("[STRONG:bold]"), "Should see strong, got: {result}");
|
|
527
584
|
assert!(result.contains("[EM:italic]"), "Should see emphasis, got: {result}");
|
|
@@ -551,7 +608,7 @@ fn test_no_double_visit_in_links() {
|
|
|
551
608
|
|
|
552
609
|
let html = r#"<a href="/url">link text</a>"#;
|
|
553
610
|
let visitor = Rc::new(RefCell::new(CountingVisitor::default()));
|
|
554
|
-
let _result =
|
|
611
|
+
let _result = convert(html, None, Some(visitor.clone())).expect("conversion failed");
|
|
555
612
|
|
|
556
613
|
assert_eq!(
|
|
557
614
|
visitor.borrow().text_visits,
|
|
@@ -581,7 +638,7 @@ fn test_no_double_visit_in_headings() {
|
|
|
581
638
|
|
|
582
639
|
let html = r"<h1>heading text</h1>";
|
|
583
640
|
let visitor = Rc::new(RefCell::new(CountingVisitor::default()));
|
|
584
|
-
let _result =
|
|
641
|
+
let _result = convert(html, None, Some(visitor.clone())).expect("conversion failed");
|
|
585
642
|
|
|
586
643
|
assert_eq!(
|
|
587
644
|
visitor.borrow().text_visits,
|
|
@@ -624,8 +681,10 @@ fn test_visitor_with_skip_images() {
|
|
|
624
681
|
};
|
|
625
682
|
|
|
626
683
|
let visitor = Rc::new(RefCell::new(SkipImageVisitor::default()));
|
|
627
|
-
let result =
|
|
628
|
-
.expect("conversion with skip_images and visitor should succeed")
|
|
684
|
+
let result = convert(html, Some(options), Some(visitor))
|
|
685
|
+
.expect("conversion with skip_images and visitor should succeed")
|
|
686
|
+
.content
|
|
687
|
+
.unwrap_or_default();
|
|
629
688
|
|
|
630
689
|
// When skip_images is true, images should not appear in output
|
|
631
690
|
assert!(
|
|
@@ -650,7 +709,7 @@ fn test_visitor_with_skip_images() {
|
|
|
650
709
|
/// Test that the main `convert()` function accepts optional visitor parameter
|
|
651
710
|
#[test]
|
|
652
711
|
fn test_convert_accepts_visitor_parameter() {
|
|
653
|
-
use html_to_markdown_rs::
|
|
712
|
+
use html_to_markdown_rs::convert;
|
|
654
713
|
|
|
655
714
|
#[derive(Debug, Default)]
|
|
656
715
|
struct CountingVisitor {
|
|
@@ -674,7 +733,7 @@ fn test_convert_accepts_visitor_parameter() {
|
|
|
674
733
|
let visitor = Rc::new(RefCell::new(CountingVisitor::default()));
|
|
675
734
|
|
|
676
735
|
// Test using the main convert() function with visitor parameter
|
|
677
|
-
let _result =
|
|
736
|
+
let _result = convert(html, None, Some(visitor.clone())).expect("convert with visitor should work");
|
|
678
737
|
|
|
679
738
|
let borrowed = visitor.borrow();
|
|
680
739
|
assert!(
|
|
@@ -719,7 +778,10 @@ fn test_convert_with_inline_images_accepts_visitor() {
|
|
|
719
778
|
|
|
720
779
|
// Verify visitor callbacks fire via convert_with_visitor
|
|
721
780
|
let visitor = Rc::new(RefCell::new(ImageTrackingVisitor::default()));
|
|
722
|
-
let markdown =
|
|
781
|
+
let markdown = convert(html, None, Some(visitor.clone()))
|
|
782
|
+
.expect("convert should work")
|
|
783
|
+
.content
|
|
784
|
+
.unwrap_or_default();
|
|
723
785
|
|
|
724
786
|
assert_eq!(
|
|
725
787
|
visitor.borrow().images_seen,
|
|
@@ -771,7 +833,10 @@ fn test_visitor_and_metadata_both_work() {
|
|
|
771
833
|
|
|
772
834
|
// Verify visitor callbacks fire via convert_with_visitor
|
|
773
835
|
let visitor = Rc::new(RefCell::new(MetadataAwareVisitor::default()));
|
|
774
|
-
let markdown =
|
|
836
|
+
let markdown = convert(html, None, Some(visitor.clone()))
|
|
837
|
+
.expect("convert should work")
|
|
838
|
+
.content
|
|
839
|
+
.unwrap_or_default();
|
|
775
840
|
|
|
776
841
|
let borrowed = visitor.borrow();
|
|
777
842
|
assert!(
|
|
@@ -788,7 +853,7 @@ fn test_visitor_and_metadata_both_work() {
|
|
|
788
853
|
drop(borrowed);
|
|
789
854
|
|
|
790
855
|
// Verify metadata extraction via convert()
|
|
791
|
-
let result = html_to_markdown_rs::convert(html, None).expect("convert should work");
|
|
856
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("convert should work");
|
|
792
857
|
let metadata = result.metadata;
|
|
793
858
|
|
|
794
859
|
assert_eq!(
|
|
@@ -856,7 +921,10 @@ fn test_convert_with_all_features_and_visitor() {
|
|
|
856
921
|
|
|
857
922
|
// Verify visitor callbacks fire via convert_with_visitor
|
|
858
923
|
let visitor = Rc::new(RefCell::new(ComprehensiveVisitor::default()));
|
|
859
|
-
let markdown =
|
|
924
|
+
let markdown = convert(html, None, Some(visitor.clone()))
|
|
925
|
+
.expect("convert should work")
|
|
926
|
+
.content
|
|
927
|
+
.unwrap_or_default();
|
|
860
928
|
|
|
861
929
|
// Verify all visitor callbacks were invoked
|
|
862
930
|
let borrowed = visitor.borrow();
|
|
@@ -901,7 +969,7 @@ fn test_image_visitor_with_metadata_does_not_panic() {
|
|
|
901
969
|
..Default::default()
|
|
902
970
|
};
|
|
903
971
|
|
|
904
|
-
let result =
|
|
972
|
+
let result = convert(html, Some(options), Some(Rc::new(RefCell::new(ImageVisitor))));
|
|
905
973
|
assert!(result.is_ok(), "conversion panicked or errored: {:?}", result.err());
|
|
906
974
|
}
|
|
907
975
|
|
|
@@ -927,10 +995,10 @@ fn test_element_end_replacement_with_metadata_preserves_subsequent_content() {
|
|
|
927
995
|
..Default::default()
|
|
928
996
|
};
|
|
929
997
|
|
|
930
|
-
let result =
|
|
998
|
+
let result = convert(html, Some(options), Some(Rc::new(RefCell::new(FigureReplacingVisitor))));
|
|
931
999
|
assert!(result.is_ok(), "conversion panicked or errored: {:?}", result.err());
|
|
932
1000
|
assert!(
|
|
933
|
-
result.unwrap().contains("after"),
|
|
1001
|
+
result.unwrap().content.unwrap_or_default().contains("after"),
|
|
934
1002
|
"content after replaced element should not be lost"
|
|
935
1003
|
);
|
|
936
1004
|
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
metadata
CHANGED
|
@@ -1,69 +1,50 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.
|
|
4
|
+
version: 3.4.0.pre.rc.13
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
|
-
-
|
|
7
|
+
- Kreuzberg Team
|
|
8
8
|
autorequire:
|
|
9
|
-
bindir:
|
|
9
|
+
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
19
|
version: '0.9'
|
|
20
|
-
- - "<"
|
|
21
|
-
- !ruby/object:Gem::Version
|
|
22
|
-
version: '1.0'
|
|
23
20
|
type: :runtime
|
|
24
21
|
prerelease: false
|
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
26
23
|
requirements:
|
|
27
|
-
- - "
|
|
24
|
+
- - "~>"
|
|
28
25
|
- !ruby/object:Gem::Version
|
|
29
26
|
version: '0.9'
|
|
30
|
-
|
|
31
|
-
- !ruby/object:Gem::Version
|
|
32
|
-
version: '1.0'
|
|
33
|
-
description: |-
|
|
34
|
-
html-to-markdown is a native Ruby extension built on the shared Rust engine that powers the html-to-markdown project.
|
|
35
|
-
It delivers identical HTML-to-Markdown output across languages, exposes inline image extraction, and ships with a CLI for automation workflows.
|
|
27
|
+
description: High-performance HTML to Markdown converter
|
|
36
28
|
email:
|
|
37
|
-
|
|
38
|
-
executables:
|
|
39
|
-
- html-to-markdown
|
|
29
|
+
executables: []
|
|
40
30
|
extensions:
|
|
41
31
|
- ext/html_to_markdown_rb/extconf.rb
|
|
42
|
-
extra_rdoc_files:
|
|
43
|
-
- README.md
|
|
32
|
+
extra_rdoc_files: []
|
|
44
33
|
files:
|
|
45
|
-
- ".bundle/config"
|
|
46
|
-
- ".gitignore"
|
|
47
|
-
- ".rubocop.yml"
|
|
48
|
-
- Gemfile
|
|
49
|
-
- Gemfile.lock
|
|
50
|
-
- README.md
|
|
51
|
-
- Rakefile
|
|
52
34
|
- Steepfile
|
|
53
|
-
- exe/html-to-markdown
|
|
54
35
|
- ext/html_to_markdown_rb/Cargo.toml
|
|
55
36
|
- ext/html_to_markdown_rb/extconf.rb
|
|
56
|
-
- ext/html_to_markdown_rb/
|
|
57
|
-
- ext/html_to_markdown_rb/src/
|
|
37
|
+
- ext/html_to_markdown_rb/native/Cargo.toml
|
|
38
|
+
- ext/html_to_markdown_rb/src/html-to-markdown.rb
|
|
39
|
+
- ext/html_to_markdown_rb/src/html-to-markdown/version.rb
|
|
58
40
|
- ext/html_to_markdown_rb/src/lib.rs
|
|
59
|
-
- html-to-markdown
|
|
41
|
+
- lib/bin/html-to-markdown
|
|
60
42
|
- lib/html_to_markdown.rb
|
|
61
43
|
- lib/html_to_markdown/version.rb
|
|
62
|
-
- lib/html_to_markdown_rs.rb
|
|
63
|
-
- sig/html_to_markdown.rbs
|
|
64
44
|
- sig/html_to_markdown/cli.rbs
|
|
65
45
|
- sig/html_to_markdown/cli_proxy.rbs
|
|
66
46
|
- sig/open3.rbs
|
|
47
|
+
- sig/types.rbs
|
|
67
48
|
- vendor/Cargo.toml
|
|
68
49
|
- vendor/html-to-markdown-rs/Cargo.toml
|
|
69
50
|
- vendor/html-to-markdown-rs/README.md
|
|
@@ -141,9 +122,7 @@ files:
|
|
|
141
122
|
- vendor/html-to-markdown-rs/src/converter/semantic/mod.rs
|
|
142
123
|
- vendor/html-to-markdown-rs/src/converter/semantic/sectioning.rs
|
|
143
124
|
- vendor/html-to-markdown-rs/src/converter/semantic/summary.rs
|
|
144
|
-
- vendor/html-to-markdown-rs/src/converter/text/escaping.rs
|
|
145
125
|
- vendor/html-to-markdown-rs/src/converter/text/mod.rs
|
|
146
|
-
- vendor/html-to-markdown-rs/src/converter/text/normalization.rs
|
|
147
126
|
- vendor/html-to-markdown-rs/src/converter/text/processing.rs
|
|
148
127
|
- vendor/html-to-markdown-rs/src/converter/text_node.rs
|
|
149
128
|
- vendor/html-to-markdown-rs/src/converter/utility/attributes.rs
|
|
@@ -170,7 +149,6 @@ files:
|
|
|
170
149
|
- vendor/html-to-markdown-rs/src/options/validation.rs
|
|
171
150
|
- vendor/html-to-markdown-rs/src/prelude.rs
|
|
172
151
|
- vendor/html-to-markdown-rs/src/rcdom.rs
|
|
173
|
-
- vendor/html-to-markdown-rs/src/safety.rs
|
|
174
152
|
- vendor/html-to-markdown-rs/src/text.rs
|
|
175
153
|
- vendor/html-to-markdown-rs/src/types/document.rs
|
|
176
154
|
- vendor/html-to-markdown-rs/src/types/mod.rs
|
|
@@ -196,6 +174,7 @@ files:
|
|
|
196
174
|
- vendor/html-to-markdown-rs/tests/br_in_inline_test.rs
|
|
197
175
|
- vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs
|
|
198
176
|
- vendor/html-to-markdown-rs/tests/djot_output_test.rs
|
|
177
|
+
- vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs
|
|
199
178
|
- vendor/html-to-markdown-rs/tests/integration_test.rs
|
|
200
179
|
- vendor/html-to-markdown-rs/tests/issue_121_regressions.rs
|
|
201
180
|
- vendor/html-to-markdown-rs/tests/issue_127_regressions.rs
|
|
@@ -218,11 +197,14 @@ files:
|
|
|
218
197
|
- vendor/html-to-markdown-rs/tests/plain_output_test.rs
|
|
219
198
|
- vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
|
|
220
199
|
- vendor/html-to-markdown-rs/tests/reference_links_test.rs
|
|
200
|
+
- vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs
|
|
221
201
|
- vendor/html-to-markdown-rs/tests/skip_images_test.rs
|
|
222
202
|
- vendor/html-to-markdown-rs/tests/tables_test.rs
|
|
223
203
|
- vendor/html-to-markdown-rs/tests/test_custom_elements.rs
|
|
224
204
|
- vendor/html-to-markdown-rs/tests/test_issue_187.rs
|
|
225
205
|
- vendor/html-to-markdown-rs/tests/test_issue_218.rs
|
|
206
|
+
- vendor/html-to-markdown-rs/tests/test_issue_277.rs
|
|
207
|
+
- vendor/html-to-markdown-rs/tests/test_max_depth.rs
|
|
226
208
|
- vendor/html-to-markdown-rs/tests/test_nested_simple.rs
|
|
227
209
|
- vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs
|
|
228
210
|
- vendor/html-to-markdown-rs/tests/test_spa_bisect.rs
|
|
@@ -233,12 +215,8 @@ homepage: https://github.com/kreuzberg-dev/html-to-markdown
|
|
|
233
215
|
licenses:
|
|
234
216
|
- MIT
|
|
235
217
|
metadata:
|
|
218
|
+
keywords: html,markdown,converter
|
|
236
219
|
rubygems_mfa_required: 'true'
|
|
237
|
-
homepage_uri: https://github.com/kreuzberg-dev/html-to-markdown
|
|
238
|
-
source_code_uri: https://github.com/kreuzberg-dev/html-to-markdown
|
|
239
|
-
bug_tracker_uri: https://github.com/kreuzberg-dev/html-to-markdown/issues
|
|
240
|
-
changelog_uri: https://github.com/kreuzberg-dev/html-to-markdown/releases
|
|
241
|
-
documentation_uri: https://github.com/kreuzberg-dev/html-to-markdown/blob/main/packages/ruby/README.md
|
|
242
220
|
post_install_message:
|
|
243
221
|
rdoc_options: []
|
|
244
222
|
require_paths:
|
|
@@ -247,7 +225,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
247
225
|
requirements:
|
|
248
226
|
- - ">="
|
|
249
227
|
- !ruby/object:Gem::Version
|
|
250
|
-
version:
|
|
228
|
+
version: 3.2.0
|
|
251
229
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
252
230
|
requirements:
|
|
253
231
|
- - ">="
|
|
@@ -257,5 +235,5 @@ requirements: []
|
|
|
257
235
|
rubygems_version: 3.5.22
|
|
258
236
|
signing_key:
|
|
259
237
|
specification_version: 4
|
|
260
|
-
summary:
|
|
238
|
+
summary: High-performance HTML to Markdown converter
|
|
261
239
|
test_files: []
|
data/.bundle/config
DELETED
data/.gitignore
DELETED
data/.rubocop.yml
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
plugins:
|
|
2
|
-
- rubocop-performance
|
|
3
|
-
- rubocop-rspec
|
|
4
|
-
|
|
5
|
-
AllCops:
|
|
6
|
-
TargetRubyVersion: 3.2
|
|
7
|
-
NewCops: enable
|
|
8
|
-
SuggestExtensions: false
|
|
9
|
-
Exclude:
|
|
10
|
-
- 'vendor/**/*'
|
|
11
|
-
- 'tmp/**/*'
|
|
12
|
-
- 'lib/**/*.bundle'
|
|
13
|
-
- 'ext/**/*'
|
|
14
|
-
|
|
15
|
-
Style/FrozenStringLiteralComment:
|
|
16
|
-
Enabled: true
|
|
17
|
-
EnforcedStyle: always
|
|
18
|
-
|
|
19
|
-
Style/StringLiterals:
|
|
20
|
-
Enabled: true
|
|
21
|
-
EnforcedStyle: single_quotes
|
|
22
|
-
|
|
23
|
-
Style/StringLiteralsInInterpolation:
|
|
24
|
-
Enabled: true
|
|
25
|
-
EnforcedStyle: single_quotes
|
|
26
|
-
|
|
27
|
-
Style/Documentation:
|
|
28
|
-
Enabled: false
|
|
29
|
-
|
|
30
|
-
Layout/LineLength:
|
|
31
|
-
Max: 120
|
|
32
|
-
AllowedPatterns:
|
|
33
|
-
- '\A\s*#'
|
|
34
|
-
Exclude:
|
|
35
|
-
- 'spec/**/*'
|
|
36
|
-
|
|
37
|
-
Metrics/MethodLength:
|
|
38
|
-
Max: 20
|
|
39
|
-
Exclude:
|
|
40
|
-
- 'spec/**/*'
|
|
41
|
-
|
|
42
|
-
Metrics/BlockLength:
|
|
43
|
-
Enabled: true
|
|
44
|
-
Max: 350
|
|
45
|
-
CountComments: false
|
|
46
|
-
|
|
47
|
-
Metrics/AbcSize:
|
|
48
|
-
Max: 20
|
|
49
|
-
Exclude:
|
|
50
|
-
- 'spec/**/*'
|
|
51
|
-
|
|
52
|
-
RSpec/ExampleLength:
|
|
53
|
-
Max: 50
|
|
54
|
-
|
|
55
|
-
RSpec/MultipleExpectations:
|
|
56
|
-
Max: 25
|
|
57
|
-
|
|
58
|
-
RSpec/NestedGroups:
|
|
59
|
-
Max: 6
|
data/Gemfile
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
source 'https://rubygems.org'
|
|
4
|
-
|
|
5
|
-
ruby '>= 3.2'
|
|
6
|
-
|
|
7
|
-
gemspec
|
|
8
|
-
|
|
9
|
-
group :development, :test do
|
|
10
|
-
gem 'rake-compiler'
|
|
11
|
-
gem 'rbs', require: false
|
|
12
|
-
gem 'rb_sys' # provides build tooling when developing locally
|
|
13
|
-
gem 'rspec'
|
|
14
|
-
gem 'rubocop', require: false
|
|
15
|
-
gem 'rubocop-performance', require: false
|
|
16
|
-
gem 'rubocop-rspec', require: false
|
|
17
|
-
gem 'steep', require: false
|
|
18
|
-
end
|