html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
|
@@ -27,7 +27,7 @@ use std::borrow::Cow;
|
|
|
27
27
|
/// - **Inline mode**: Children are processed inline without block spacing
|
|
28
28
|
/// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
|
|
29
29
|
/// - **Empty content**: Skipped entirely
|
|
30
|
-
pub(crate) fn handle_form(
|
|
30
|
+
pub fn handle_form(
|
|
31
31
|
_tag_name: &str,
|
|
32
32
|
node_handle: &tl::NodeHandle,
|
|
33
33
|
parser: &tl::Parser,
|
|
@@ -82,7 +82,7 @@ pub(crate) fn handle_form(
|
|
|
82
82
|
/// - **Inline mode**: Children are processed inline without block spacing
|
|
83
83
|
/// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
|
|
84
84
|
/// - **Empty content**: Skipped entirely
|
|
85
|
-
pub(crate) fn handle_fieldset(
|
|
85
|
+
pub fn handle_fieldset(
|
|
86
86
|
_tag_name: &str,
|
|
87
87
|
node_handle: &tl::NodeHandle,
|
|
88
88
|
parser: &tl::Parser,
|
|
@@ -137,7 +137,7 @@ pub(crate) fn handle_fieldset(
|
|
|
137
137
|
/// - **Block mode**: Content is wrapped in strong markers (e.g., `**text**`)
|
|
138
138
|
/// - **Inline mode**: Content is rendered without emphasis
|
|
139
139
|
/// - Uses the configured strong/emphasis symbol from ConversionOptions
|
|
140
|
-
pub(crate) fn handle_legend(
|
|
140
|
+
pub fn handle_legend(
|
|
141
141
|
_tag_name: &str,
|
|
142
142
|
node_handle: &tl::NodeHandle,
|
|
143
143
|
parser: &tl::Parser,
|
|
@@ -198,7 +198,7 @@ pub(crate) fn handle_legend(
|
|
|
198
198
|
/// - Content is collected from children
|
|
199
199
|
/// - Non-empty content is output followed by blank lines (in block mode)
|
|
200
200
|
/// - Blank lines are suppressed in inline mode
|
|
201
|
-
pub(crate) fn handle_label(
|
|
201
|
+
pub fn handle_label(
|
|
202
202
|
_tag_name: &str,
|
|
203
203
|
node_handle: &tl::NodeHandle,
|
|
204
204
|
parser: &tl::Parser,
|
|
@@ -231,7 +231,7 @@ pub(crate) fn handle_label(
|
|
|
231
231
|
///
|
|
232
232
|
/// An input element represents a form control for user input. Since input
|
|
233
233
|
/// elements typically have no text content, this handler produces no output.
|
|
234
|
-
pub(crate) fn handle_input(
|
|
234
|
+
pub fn handle_input(
|
|
235
235
|
_tag_name: &str,
|
|
236
236
|
_node_handle: &tl::NodeHandle,
|
|
237
237
|
_parser: &tl::Parser,
|
|
@@ -253,7 +253,7 @@ pub(crate) fn handle_input(
|
|
|
253
253
|
///
|
|
254
254
|
/// - Content is collected from children
|
|
255
255
|
/// - Blank lines are added after content in block mode only
|
|
256
|
-
pub(crate) fn handle_textarea(
|
|
256
|
+
pub fn handle_textarea(
|
|
257
257
|
_tag_name: &str,
|
|
258
258
|
node_handle: &tl::NodeHandle,
|
|
259
259
|
parser: &tl::Parser,
|
|
@@ -287,7 +287,7 @@ pub(crate) fn handle_textarea(
|
|
|
287
287
|
///
|
|
288
288
|
/// - Content (options) is collected from children
|
|
289
289
|
/// - A single newline is added after the select in block mode
|
|
290
|
-
pub(crate) fn handle_select(
|
|
290
|
+
pub fn handle_select(
|
|
291
291
|
_tag_name: &str,
|
|
292
292
|
node_handle: &tl::NodeHandle,
|
|
293
293
|
parser: &tl::Parser,
|
|
@@ -322,7 +322,7 @@ pub(crate) fn handle_select(
|
|
|
322
322
|
/// - Content is collected from children
|
|
323
323
|
/// - If the option has the `selected` attribute, it's prefixed with `* ` in block mode
|
|
324
324
|
/// - A newline is added after each option in block mode
|
|
325
|
-
pub(crate) fn handle_option(
|
|
325
|
+
pub fn handle_option(
|
|
326
326
|
_tag_name: &str,
|
|
327
327
|
node_handle: &tl::NodeHandle,
|
|
328
328
|
parser: &tl::Parser,
|
|
@@ -365,7 +365,7 @@ pub(crate) fn handle_option(
|
|
|
365
365
|
///
|
|
366
366
|
/// - The `label` attribute is output as strong text (if present)
|
|
367
367
|
/// - Options within the group are rendered normally
|
|
368
|
-
pub(crate) fn handle_optgroup(
|
|
368
|
+
pub fn handle_optgroup(
|
|
369
369
|
_tag_name: &str,
|
|
370
370
|
node_handle: &tl::NodeHandle,
|
|
371
371
|
parser: &tl::Parser,
|
|
@@ -410,7 +410,7 @@ pub(crate) fn handle_optgroup(
|
|
|
410
410
|
///
|
|
411
411
|
/// - Content is collected from children
|
|
412
412
|
/// - Blank lines are added after content in block mode only
|
|
413
|
-
pub(crate) fn handle_button(
|
|
413
|
+
pub fn handle_button(
|
|
414
414
|
_tag_name: &str,
|
|
415
415
|
node_handle: &tl::NodeHandle,
|
|
416
416
|
parser: &tl::Parser,
|
|
@@ -444,7 +444,7 @@ pub(crate) fn handle_button(
|
|
|
444
444
|
///
|
|
445
445
|
/// - Content is collected from children (usually empty)
|
|
446
446
|
/// - Blank lines are added after content in block mode only
|
|
447
|
-
pub(crate) fn handle_progress(
|
|
447
|
+
pub fn handle_progress(
|
|
448
448
|
_tag_name: &str,
|
|
449
449
|
node_handle: &tl::NodeHandle,
|
|
450
450
|
parser: &tl::Parser,
|
|
@@ -478,7 +478,7 @@ pub(crate) fn handle_progress(
|
|
|
478
478
|
///
|
|
479
479
|
/// - Content is collected from children (usually empty)
|
|
480
480
|
/// - Blank lines are added after content in block mode only
|
|
481
|
-
pub(crate) fn handle_meter(
|
|
481
|
+
pub fn handle_meter(
|
|
482
482
|
_tag_name: &str,
|
|
483
483
|
node_handle: &tl::NodeHandle,
|
|
484
484
|
parser: &tl::Parser,
|
|
@@ -512,7 +512,7 @@ pub(crate) fn handle_meter(
|
|
|
512
512
|
///
|
|
513
513
|
/// - Content is collected from children
|
|
514
514
|
/// - Blank lines are added after content in block mode only
|
|
515
|
-
pub(crate) fn handle_output(
|
|
515
|
+
pub fn handle_output(
|
|
516
516
|
_tag_name: &str,
|
|
517
517
|
node_handle: &tl::NodeHandle,
|
|
518
518
|
parser: &tl::Parser,
|
|
@@ -546,7 +546,7 @@ pub(crate) fn handle_output(
|
|
|
546
546
|
///
|
|
547
547
|
/// - Content (options) is collected from children
|
|
548
548
|
/// - A single newline is added after the datalist in block mode
|
|
549
|
-
pub
|
|
549
|
+
pub fn handle_datalist(
|
|
550
550
|
_tag_name: &str,
|
|
551
551
|
node_handle: &tl::NodeHandle,
|
|
552
552
|
parser: &tl::Parser,
|
|
@@ -35,7 +35,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
35
35
|
/// # Note
|
|
36
36
|
/// This function references helper functions and `walk_node` from converter.rs
|
|
37
37
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
38
|
-
pub
|
|
38
|
+
pub fn handle(
|
|
39
39
|
tag_name: &str,
|
|
40
40
|
node_handle: &NodeHandle,
|
|
41
41
|
parser: &Parser,
|
|
@@ -32,7 +32,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
32
32
|
/// # Note
|
|
33
33
|
/// This function references helper functions and `walk_node` from converter.rs
|
|
34
34
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
35
|
-
pub
|
|
35
|
+
pub fn handle(
|
|
36
36
|
tag_name: &str,
|
|
37
37
|
node_handle: &NodeHandle,
|
|
38
38
|
parser: &Parser,
|
|
@@ -44,7 +44,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
44
44
|
/// # Note
|
|
45
45
|
/// This function references helper functions from converter.rs
|
|
46
46
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
47
|
-
pub(crate) fn handle(
|
|
47
|
+
pub fn handle(
|
|
48
48
|
node_handle: &NodeHandle,
|
|
49
49
|
parser: &Parser,
|
|
50
50
|
output: &mut String,
|
|
@@ -360,7 +360,7 @@ pub(crate) fn handle(
|
|
|
360
360
|
/// * `title` - Optional link title attribute
|
|
361
361
|
/// * `raw_text` - Original unprocessed text (for default_title option)
|
|
362
362
|
/// * `options` - Conversion options
|
|
363
|
-
pub
|
|
363
|
+
pub fn append_markdown_link(
|
|
364
364
|
output: &mut String,
|
|
365
365
|
label: &str,
|
|
366
366
|
href: &str,
|
|
@@ -31,7 +31,6 @@ pub mod ruby;
|
|
|
31
31
|
pub mod semantic;
|
|
32
32
|
|
|
33
33
|
// Re-export types from parent module for submodule access
|
|
34
|
-
pub use super::{Context, DomContext};
|
|
35
34
|
|
|
36
35
|
// Re-export handler functions for internal use by dispatcher (crate-private)
|
|
37
36
|
// pub(crate) use ruby::handle as handle_ruby;
|
|
@@ -45,7 +45,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
45
45
|
/// # Note
|
|
46
46
|
/// This function references `walk_node` and `normalized_tag_name` from converter.rs,
|
|
47
47
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
48
|
-
pub
|
|
48
|
+
pub fn handle(
|
|
49
49
|
tag_name: &str,
|
|
50
50
|
node_handle: &NodeHandle,
|
|
51
51
|
parser: &Parser,
|
|
@@ -37,7 +37,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
37
37
|
/// # Note
|
|
38
38
|
/// This function references helper functions and `walk_node` from converter.rs
|
|
39
39
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
40
|
-
pub
|
|
40
|
+
pub fn handle(
|
|
41
41
|
tag_name: &str,
|
|
42
42
|
node_handle: &NodeHandle,
|
|
43
43
|
parser: &Parser,
|
|
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
15
15
|
/// Handle definition list element (<dl>).
|
|
16
16
|
///
|
|
17
17
|
/// Groups dt/dd pairs and formats them with proper Markdown separation.
|
|
18
|
-
pub(crate) fn handle_dl(
|
|
18
|
+
pub fn handle_dl(
|
|
19
19
|
node_handle: &tl::NodeHandle,
|
|
20
20
|
parser: &tl::Parser,
|
|
21
21
|
output: &mut String,
|
|
@@ -61,7 +61,7 @@ pub(crate) fn handle_dl(
|
|
|
61
61
|
/// Handle definition term element (<dt>).
|
|
62
62
|
///
|
|
63
63
|
/// Outputs the term text followed by a newline.
|
|
64
|
-
pub(crate) fn handle_dt(
|
|
64
|
+
pub fn handle_dt(
|
|
65
65
|
node_handle: &tl::NodeHandle,
|
|
66
66
|
parser: &tl::Parser,
|
|
67
67
|
output: &mut String,
|
|
@@ -96,7 +96,7 @@ pub(crate) fn handle_dt(
|
|
|
96
96
|
/// Handle definition description element (<dd>).
|
|
97
97
|
///
|
|
98
98
|
/// Outputs the description as a plain block.
|
|
99
|
-
pub
|
|
99
|
+
pub fn handle_dd(
|
|
100
100
|
node_handle: &tl::NodeHandle,
|
|
101
101
|
parser: &tl::Parser,
|
|
102
102
|
output: &mut String,
|
|
@@ -24,7 +24,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
24
24
|
/// Processes list item content with support for task lists (checkboxes),
|
|
25
25
|
/// proper indentation, and block-level element detection.
|
|
26
26
|
#[allow(clippy::too_many_arguments)]
|
|
27
|
-
pub
|
|
27
|
+
pub fn handle_li(
|
|
28
28
|
node_handle: &tl::NodeHandle,
|
|
29
29
|
tag: &tl::HTMLTag,
|
|
30
30
|
parser: &tl::Parser,
|
|
@@ -26,7 +26,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
26
26
|
/// Extracts the `start` attribute to set initial counter value,
|
|
27
27
|
/// detects loose/tight list format, and processes list items.
|
|
28
28
|
#[allow(clippy::too_many_arguments)]
|
|
29
|
-
pub(crate) fn handle_ol(
|
|
29
|
+
pub fn handle_ol(
|
|
30
30
|
node_handle: &tl::NodeHandle,
|
|
31
31
|
parser: &tl::Parser,
|
|
32
32
|
output: &mut String,
|
|
@@ -198,4 +198,4 @@ pub(crate) fn handle_ol(
|
|
|
198
198
|
}
|
|
199
199
|
|
|
200
200
|
/// Public alias for `handle_ol` to match the expected module interface.
|
|
201
|
-
pub
|
|
201
|
+
pub use handle_ol as handle;
|
|
@@ -26,7 +26,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
26
26
|
/// Detects loose/tight list format, handles nested bullets,
|
|
27
27
|
/// and processes list items with proper indentation.
|
|
28
28
|
#[allow(clippy::too_many_arguments)]
|
|
29
|
-
pub(crate) fn handle_ul(
|
|
29
|
+
pub fn handle_ul(
|
|
30
30
|
node_handle: &tl::NodeHandle,
|
|
31
31
|
parser: &tl::Parser,
|
|
32
32
|
output: &mut String,
|
|
@@ -192,4 +192,4 @@ pub(crate) fn handle_ul(
|
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
/// Public alias for `handle_ul` to match the expected module interface.
|
|
195
|
-
pub
|
|
195
|
+
pub use handle_ul as handle;
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
)]
|
|
12
12
|
|
|
13
13
|
use std::borrow::Cow;
|
|
14
|
-
use std::collections::BTreeMap;
|
|
14
|
+
use std::collections::{BTreeMap, HashSet};
|
|
15
15
|
|
|
16
16
|
use crate::converter::dom_context::DomContext;
|
|
17
17
|
use crate::converter::main_helpers::{
|
|
@@ -33,26 +33,6 @@ use crate::options::ConversionOptions;
|
|
|
33
33
|
use crate::converter::context::{Context, InlineCollectorHandle};
|
|
34
34
|
use crate::types::structure_collector::StructureCollectorHandle;
|
|
35
35
|
|
|
36
|
-
/// Converts HTML to Markdown using the provided conversion options.
|
|
37
|
-
///
|
|
38
|
-
/// This is the main entry point for HTML to Markdown conversion.
|
|
39
|
-
pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
|
|
40
|
-
convert_html_impl(html, options, None, None, None, None).map(|(md, _)| md)
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/// Converts HTML to Markdown with a custom visitor for callbacks during traversal.
|
|
44
|
-
///
|
|
45
|
-
/// This variant allows passing a visitor that will receive callbacks for each node
|
|
46
|
-
/// during the tree walk, enabling custom processing or analysis.
|
|
47
|
-
#[cfg(feature = "visitor")]
|
|
48
|
-
pub fn convert_html_with_visitor(
|
|
49
|
-
html: &str,
|
|
50
|
-
options: &ConversionOptions,
|
|
51
|
-
visitor: Option<crate::visitor::VisitorHandle>,
|
|
52
|
-
) -> Result<String> {
|
|
53
|
-
convert_html_impl(html, options, None, None, visitor, None).map(|(md, _)| md)
|
|
54
|
-
}
|
|
55
|
-
|
|
56
36
|
/// Internal implementation of HTML to Markdown conversion.
|
|
57
37
|
///
|
|
58
38
|
/// Returns `(markdown, Option<DocumentStructure>)`. The structure is populated when
|
|
@@ -62,7 +42,7 @@ pub fn convert_html_with_visitor(
|
|
|
62
42
|
allow(unused_variables)
|
|
63
43
|
)]
|
|
64
44
|
#[allow(clippy::too_many_lines)]
|
|
65
|
-
pub(crate) fn convert_html_impl(
|
|
45
|
+
pub fn convert_html_impl(
|
|
66
46
|
html: &str,
|
|
67
47
|
options: &ConversionOptions,
|
|
68
48
|
inline_collector: Option<InlineCollectorHandle>,
|
|
@@ -82,7 +62,9 @@ pub(crate) fn convert_html_impl(
|
|
|
82
62
|
|
|
83
63
|
if has_custom_element_tags(&preprocessed) {
|
|
84
64
|
if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
|
|
85
|
-
let
|
|
65
|
+
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
66
|
+
let stripped = strip_hidden_elements(&stripped);
|
|
67
|
+
let repaired = preprocess_html(&stripped).into_owned();
|
|
86
68
|
preprocessed = repaired;
|
|
87
69
|
preprocessed_len = preprocessed.len();
|
|
88
70
|
}
|
|
@@ -93,7 +75,9 @@ pub(crate) fn convert_html_impl(
|
|
|
93
75
|
break dom;
|
|
94
76
|
}
|
|
95
77
|
if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
|
|
96
|
-
|
|
78
|
+
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
79
|
+
let stripped = strip_hidden_elements(&stripped);
|
|
80
|
+
preprocessed = preprocess_html(&stripped).into_owned();
|
|
97
81
|
preprocessed_len = preprocessed.len();
|
|
98
82
|
continue;
|
|
99
83
|
}
|
|
@@ -111,7 +95,9 @@ pub(crate) fn convert_html_impl(
|
|
|
111
95
|
if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
|
|
112
96
|
// Drop dom to release borrow on preprocessed
|
|
113
97
|
drop(dom);
|
|
114
|
-
|
|
98
|
+
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
99
|
+
let stripped = strip_hidden_elements(&stripped);
|
|
100
|
+
preprocessed = preprocess_html(&stripped).into_owned();
|
|
115
101
|
preprocessed_len = preprocessed.len();
|
|
116
102
|
// Re-parse with repaired HTML
|
|
117
103
|
dom = tl::parse(&preprocessed, parser_options)
|
|
@@ -205,7 +191,7 @@ pub(crate) fn convert_html_impl(
|
|
|
205
191
|
};
|
|
206
192
|
|
|
207
193
|
#[cfg(all(feature = "metadata", feature = "visitor"))]
|
|
208
|
-
let ctx = Context::new(
|
|
194
|
+
let mut ctx = Context::new(
|
|
209
195
|
options,
|
|
210
196
|
inline_collector,
|
|
211
197
|
metadata_collector,
|
|
@@ -214,7 +200,7 @@ pub(crate) fn convert_html_impl(
|
|
|
214
200
|
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
215
201
|
);
|
|
216
202
|
#[cfg(all(feature = "metadata", not(feature = "visitor")))]
|
|
217
|
-
let ctx = Context::new(
|
|
203
|
+
let mut ctx = Context::new(
|
|
218
204
|
options,
|
|
219
205
|
inline_collector,
|
|
220
206
|
metadata_collector,
|
|
@@ -223,7 +209,7 @@ pub(crate) fn convert_html_impl(
|
|
|
223
209
|
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
224
210
|
);
|
|
225
211
|
#[cfg(all(not(feature = "metadata"), feature = "visitor"))]
|
|
226
|
-
let ctx = Context::new(
|
|
212
|
+
let mut ctx = Context::new(
|
|
227
213
|
options,
|
|
228
214
|
inline_collector,
|
|
229
215
|
_metadata_collector,
|
|
@@ -232,7 +218,7 @@ pub(crate) fn convert_html_impl(
|
|
|
232
218
|
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
233
219
|
);
|
|
234
220
|
#[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
|
|
235
|
-
let ctx = Context::new(
|
|
221
|
+
let mut ctx = Context::new(
|
|
236
222
|
options,
|
|
237
223
|
inline_collector,
|
|
238
224
|
_metadata_collector,
|
|
@@ -241,6 +227,20 @@ pub(crate) fn convert_html_impl(
|
|
|
241
227
|
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
242
228
|
);
|
|
243
229
|
|
|
230
|
+
// Pre-compute node IDs matching exclude_selectors so walk_node can skip them in O(1).
|
|
231
|
+
// Invalid or unsupported selectors are silently skipped.
|
|
232
|
+
if !options.exclude_selectors.is_empty() {
|
|
233
|
+
let mut excluded: HashSet<u32> = HashSet::new();
|
|
234
|
+
for selector in &options.exclude_selectors {
|
|
235
|
+
if let Some(iter) = dom.query_selector(selector) {
|
|
236
|
+
for handle in iter {
|
|
237
|
+
excluded.insert(handle.get_inner());
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
ctx.set_excluded_node_ids(excluded);
|
|
242
|
+
}
|
|
243
|
+
|
|
244
244
|
for child_handle in dom.children() {
|
|
245
245
|
walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
|
|
246
246
|
}
|
|
@@ -297,7 +297,7 @@ pub(crate) fn convert_html_impl(
|
|
|
297
297
|
#[allow(clippy::only_used_in_recursion)]
|
|
298
298
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
299
299
|
#[allow(clippy::cast_possible_truncation)]
|
|
300
|
-
pub(crate) fn walk_node(
|
|
300
|
+
pub fn walk_node(
|
|
301
301
|
node_handle: &tl::NodeHandle,
|
|
302
302
|
parser: &tl::Parser,
|
|
303
303
|
output: &mut String,
|
|
@@ -308,6 +308,12 @@ pub(crate) fn walk_node(
|
|
|
308
308
|
) {
|
|
309
309
|
let Some(node) = node_handle.get(parser) else { return };
|
|
310
310
|
|
|
311
|
+
if let Some(max) = options.max_depth {
|
|
312
|
+
if depth >= max {
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
|
|
311
317
|
match node {
|
|
312
318
|
tl::Node::Raw(bytes) => {
|
|
313
319
|
let raw = bytes.as_utf8_str();
|
|
@@ -353,7 +359,13 @@ pub(crate) fn walk_node(
|
|
|
353
359
|
}
|
|
354
360
|
}
|
|
355
361
|
|
|
356
|
-
if should_drop_for_preprocessing(
|
|
362
|
+
if should_drop_for_preprocessing(tag_name.as_ref(), tag, options) {
|
|
363
|
+
trim_trailing_whitespace(output);
|
|
364
|
+
return;
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
// Drop elements matching exclude_selectors, including all their descendants.
|
|
368
|
+
if !ctx.excluded_node_ids.is_empty() && ctx.excluded_node_ids.contains(&node_handle.get_inner()) {
|
|
357
369
|
trim_trailing_whitespace(output);
|
|
358
370
|
return;
|
|
359
371
|
}
|
|
@@ -520,6 +532,20 @@ pub(crate) fn walk_node(
|
|
|
520
532
|
);
|
|
521
533
|
}
|
|
522
534
|
|
|
535
|
+
// Sectioning elements routed to semantic dispatcher
|
|
536
|
+
"article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
|
|
537
|
+
crate::converter::semantic::dispatch_semantic_handler(
|
|
538
|
+
&tag_name,
|
|
539
|
+
node_handle,
|
|
540
|
+
parser,
|
|
541
|
+
output,
|
|
542
|
+
options,
|
|
543
|
+
ctx,
|
|
544
|
+
depth,
|
|
545
|
+
dom_ctx,
|
|
546
|
+
);
|
|
547
|
+
}
|
|
548
|
+
|
|
523
549
|
// Quote element routed to semantic dispatcher
|
|
524
550
|
"q" => {
|
|
525
551
|
crate::converter::semantic::dispatch_semantic_handler(
|
|
@@ -16,7 +16,7 @@ use crate::converter::main_helpers::tag_name_eq;
|
|
|
16
16
|
use crate::options::ConversionOptions;
|
|
17
17
|
|
|
18
18
|
/// Extract src attribute from media element (audio, video, iframe).
|
|
19
|
-
pub(crate) fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
|
|
19
|
+
pub fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
|
|
20
20
|
tag.attributes()
|
|
21
21
|
.get("src")
|
|
22
22
|
.flatten()
|
|
@@ -28,7 +28,7 @@ pub(crate) fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
|
|
|
28
28
|
///
|
|
29
29
|
/// Used by audio and video elements to extract src from child <source> elements
|
|
30
30
|
/// when the parent doesn't have a src attribute.
|
|
31
|
-
pub
|
|
31
|
+
pub fn find_source_src<'a, T>(children: T, parser: &'a Parser) -> Option<Cow<'a, str>>
|
|
32
32
|
where
|
|
33
33
|
T: IntoIterator<Item = &'a NodeHandle>,
|
|
34
34
|
{
|
|
@@ -43,14 +43,14 @@ where
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
/// Check if tag is a source element.
|
|
46
|
-
pub
|
|
46
|
+
pub fn is_source_element(tag: &HTMLTag) -> bool {
|
|
47
47
|
tag_name_eq(tag.name().as_utf8_str(), "source")
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
/// Determine if media should output source link in markdown.
|
|
51
51
|
///
|
|
52
52
|
/// Returns true if src is non-empty.
|
|
53
|
-
pub(crate) fn should_output_media_link(src: &str) -> bool {
|
|
53
|
+
pub fn should_output_media_link(src: &str) -> bool {
|
|
54
54
|
!src.is_empty()
|
|
55
55
|
}
|
|
56
56
|
|
|
@@ -58,7 +58,7 @@ pub(crate) fn should_output_media_link(src: &str) -> bool {
|
|
|
58
58
|
///
|
|
59
59
|
/// Extracts src from audio tag or nested source elements, outputs as a link,
|
|
60
60
|
/// and processes fallback content (e.g., browser compatibility text).
|
|
61
|
-
pub(crate) fn handle_audio(
|
|
61
|
+
pub fn handle_audio(
|
|
62
62
|
_node_handle: &NodeHandle,
|
|
63
63
|
tag: &HTMLTag,
|
|
64
64
|
parser: &Parser,
|
|
@@ -121,7 +121,7 @@ pub(crate) fn handle_audio(
|
|
|
121
121
|
///
|
|
122
122
|
/// Extracts src from video tag or nested source elements, outputs as a link,
|
|
123
123
|
/// and processes fallback content (e.g., browser compatibility text).
|
|
124
|
-
pub(crate) fn handle_video(
|
|
124
|
+
pub fn handle_video(
|
|
125
125
|
_node_handle: &NodeHandle,
|
|
126
126
|
tag: &HTMLTag,
|
|
127
127
|
parser: &Parser,
|
|
@@ -183,7 +183,7 @@ pub(crate) fn handle_video(
|
|
|
183
183
|
/// Handle picture element conversion to Markdown.
|
|
184
184
|
///
|
|
185
185
|
/// Finds and processes the first child img element, skipping source elements.
|
|
186
|
-
pub(crate) fn handle_picture(
|
|
186
|
+
pub fn handle_picture(
|
|
187
187
|
_node_handle: &NodeHandle,
|
|
188
188
|
tag: &HTMLTag,
|
|
189
189
|
parser: &Parser,
|
|
@@ -209,7 +209,7 @@ pub(crate) fn handle_picture(
|
|
|
209
209
|
///
|
|
210
210
|
/// Extracts src attribute from iframe and outputs as a markdown link.
|
|
211
211
|
/// iframes cannot be embedded in markdown, so we just provide a link to the source.
|
|
212
|
-
pub
|
|
212
|
+
pub fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
|
|
213
213
|
let src = tag
|
|
214
214
|
.attributes()
|
|
215
215
|
.get("src")
|
|
@@ -19,7 +19,7 @@ type InlineCollectorHandle = std::rc::Rc<std::cell::RefCell<InlineImageCollector
|
|
|
19
19
|
#[cfg(feature = "inline-images")]
|
|
20
20
|
#[allow(clippy::items_after_statements)]
|
|
21
21
|
#[allow(clippy::manual_let_else)]
|
|
22
|
-
pub
|
|
22
|
+
pub fn handle_inline_data_image(
|
|
23
23
|
collector_ref: &InlineCollectorHandle,
|
|
24
24
|
src: &str,
|
|
25
25
|
alt: &str,
|
|
@@ -15,7 +15,7 @@ pub mod svg;
|
|
|
15
15
|
pub use super::{Context, DomContext};
|
|
16
16
|
|
|
17
17
|
#[cfg(feature = "inline-images")]
|
|
18
|
-
pub
|
|
18
|
+
pub use image::handle_inline_data_image;
|
|
19
19
|
|
|
20
20
|
/// Dispatches media element handling to the appropriate handler.
|
|
21
21
|
///
|
|
@@ -23,7 +23,7 @@ type InlineCollectorHandle = std::rc::Rc<std::cell::RefCell<InlineImageCollector
|
|
|
23
23
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
24
24
|
#[allow(clippy::needless_pass_by_value)]
|
|
25
25
|
#[allow(clippy::option_if_let_else)]
|
|
26
|
-
pub(crate) fn handle_inline_svg(
|
|
26
|
+
pub fn handle_inline_svg(
|
|
27
27
|
collector_ref: &InlineCollectorHandle,
|
|
28
28
|
node_handle: &NodeHandle,
|
|
29
29
|
parser: &Parser,
|
|
@@ -93,7 +93,7 @@ pub(crate) fn handle_inline_svg(
|
|
|
93
93
|
|
|
94
94
|
/// Serialize an element to HTML string (for SVG and Math elements).
|
|
95
95
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
96
|
-
pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
|
|
96
|
+
pub fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
|
|
97
97
|
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
98
98
|
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|
|
99
99
|
let mut html = String::with_capacity(256);
|
|
@@ -132,7 +132,7 @@ pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> St
|
|
|
132
132
|
|
|
133
133
|
/// Serialize a node to HTML string.
|
|
134
134
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
135
|
-
pub(crate) fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> String {
|
|
135
|
+
pub fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> String {
|
|
136
136
|
if let Some(node) = node_handle.get(parser) {
|
|
137
137
|
match node {
|
|
138
138
|
tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
|
|
@@ -160,7 +160,7 @@ fn non_empty_trimmed(value: &str) -> Option<String> {
|
|
|
160
160
|
/// Extracts title from child elements, handles inline image collection,
|
|
161
161
|
/// and outputs either the title text (in inline mode) or a base64-encoded image.
|
|
162
162
|
#[allow(clippy::too_many_arguments)]
|
|
163
|
-
pub(crate) fn handle_svg(
|
|
163
|
+
pub fn handle_svg(
|
|
164
164
|
node_handle: &NodeHandle,
|
|
165
165
|
tag: &tl::HTMLTag,
|
|
166
166
|
parser: &Parser,
|
|
@@ -230,7 +230,7 @@ pub(crate) fn handle_svg(
|
|
|
230
230
|
///
|
|
231
231
|
/// Serializes MathML to HTML comment and outputs text content with escaping.
|
|
232
232
|
#[allow(clippy::too_many_arguments)]
|
|
233
|
-
pub(crate) fn handle_math(
|
|
233
|
+
pub fn handle_math(
|
|
234
234
|
node_handle: &NodeHandle,
|
|
235
235
|
tag: &tl::HTMLTag,
|
|
236
236
|
parser: &Parser,
|
|
@@ -115,47 +115,36 @@ pub mod visitor_hooks;
|
|
|
115
115
|
// Import and re-export public types and functions from the main module
|
|
116
116
|
pub use self::context::Context;
|
|
117
117
|
pub use self::dom_context::DomContext;
|
|
118
|
-
pub use self::main::convert_html;
|
|
119
|
-
|
|
120
|
-
#[cfg(feature = "visitor")]
|
|
121
|
-
pub use self::main::convert_html_with_visitor;
|
|
122
118
|
|
|
123
119
|
// Import the tree walker and utility functions from main and main_helpers
|
|
124
|
-
pub
|
|
125
|
-
pub
|
|
120
|
+
pub use self::main::{convert_html_impl, walk_node};
|
|
121
|
+
pub use self::main_helpers::trim_trailing_whitespace;
|
|
126
122
|
|
|
127
123
|
// Re-export helper functions from utility modules (migrated from converter_legacy)
|
|
128
|
-
pub
|
|
124
|
+
pub use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
|
|
129
125
|
#[allow(unused_imports)]
|
|
130
|
-
pub
|
|
126
|
+
pub use crate::converter::utility::serialization::{serialize_node, serialize_node_to_html};
|
|
131
127
|
|
|
132
128
|
// Helper functions migrated to utility modules
|
|
133
|
-
pub
|
|
129
|
+
pub use crate::converter::utility::siblings::append_inline_suffix;
|
|
134
130
|
|
|
135
131
|
// Caching functions migrated to utility/caching
|
|
136
132
|
|
|
137
133
|
// Content functions migrated to utility/content
|
|
138
134
|
|
|
139
135
|
// Heading functions migrated to block/heading
|
|
140
|
-
pub
|
|
136
|
+
pub use crate::converter::block::heading::find_single_heading_child;
|
|
141
137
|
|
|
142
138
|
// Link functions migrated to inline/link
|
|
143
139
|
|
|
144
140
|
// Re-export dispatch functions for routing elements to handlers
|
|
145
|
-
pub use block::dispatch_block_handler;
|
|
146
|
-
pub use form::dispatch_form_handler;
|
|
147
|
-
pub use inline::dispatch_inline_handler;
|
|
148
|
-
pub use list::dispatch_list_handler;
|
|
149
|
-
pub use semantic::dispatch_semantic_handler;
|
|
150
141
|
// Media module doesn't have a dispatcher - it exports utility functions
|
|
151
142
|
|
|
152
143
|
// Re-export utility submodules for public access to their types
|
|
153
144
|
// NOTE: utility::preprocessing is deliberately not re-exported to avoid naming conflict
|
|
154
145
|
// with preprocessing_helpers module. Users should access utility::preprocessing directly.
|
|
155
|
-
pub use utility::{attributes, caching, content, serialization, siblings};
|
|
156
146
|
|
|
157
147
|
// Re-export format renderer types
|
|
158
|
-
pub use format::{DjotRenderer, FormatRenderer, MarkdownRenderer};
|
|
159
148
|
|
|
160
149
|
// Block and inline handlers are internal - only dispatchers are exposed
|
|
161
150
|
// Individual handlers are pub(crate) and not meant to be part of the public API
|