html-to-markdown 2.30.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -19
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +6 -3
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -11,6 +11,9 @@ use crate::converter::dom_context::DomContext;
|
|
|
11
11
|
use crate::converter::main::walk_node;
|
|
12
12
|
use crate::options::ConversionOptions;
|
|
13
13
|
|
|
14
|
+
#[cfg(feature = "visitor")]
|
|
15
|
+
#[cfg(feature = "visitor")]
|
|
16
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
14
17
|
#[cfg(feature = "visitor")]
|
|
15
18
|
use std::collections::BTreeMap;
|
|
16
19
|
|
|
@@ -86,11 +89,7 @@ pub fn handle_blockquote(
|
|
|
86
89
|
if let Some(ref visitor) = ctx.visitor {
|
|
87
90
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
88
91
|
|
|
89
|
-
let attributes: BTreeMap<String, String> = tag
|
|
90
|
-
.attributes()
|
|
91
|
-
.iter()
|
|
92
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
93
|
-
.collect();
|
|
92
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
94
93
|
|
|
95
94
|
let node_id = node_handle.get_inner();
|
|
96
95
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -13,6 +13,9 @@ use crate::converter::main::walk_node;
|
|
|
13
13
|
use crate::converter::text::dedent_code_block;
|
|
14
14
|
use crate::options::ConversionOptions;
|
|
15
15
|
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
#[cfg(feature = "visitor")]
|
|
18
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
16
19
|
#[cfg(feature = "visitor")]
|
|
17
20
|
use std::collections::BTreeMap;
|
|
18
21
|
|
|
@@ -75,11 +78,7 @@ pub fn handle_code(
|
|
|
75
78
|
let code_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
76
79
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
77
80
|
|
|
78
|
-
let attributes: BTreeMap<String, String> = tag
|
|
79
|
-
.attributes()
|
|
80
|
-
.iter()
|
|
81
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
82
|
-
.collect();
|
|
81
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
83
82
|
|
|
84
83
|
let node_id = node_handle.get_inner();
|
|
85
84
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -255,11 +254,7 @@ pub fn handle_pre(
|
|
|
255
254
|
let code_block_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
256
255
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
257
256
|
|
|
258
|
-
let attributes: BTreeMap<String, String> = tag
|
|
259
|
-
.attributes()
|
|
260
|
-
.iter()
|
|
261
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
262
|
-
.collect();
|
|
257
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
263
258
|
|
|
264
259
|
let node_id = node_handle.get_inner();
|
|
265
260
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -11,6 +11,8 @@ use std::collections::BTreeMap;
|
|
|
11
11
|
|
|
12
12
|
use crate::converter::Context;
|
|
13
13
|
use crate::converter::dom_context::DomContext;
|
|
14
|
+
#[cfg(feature = "visitor")]
|
|
15
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
14
16
|
use crate::options::ConversionOptions;
|
|
15
17
|
|
|
16
18
|
#[cfg(feature = "visitor")]
|
|
@@ -100,11 +102,7 @@ pub fn handle_graphic(
|
|
|
100
102
|
let graphic_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
101
103
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
102
104
|
|
|
103
|
-
let attributes: BTreeMap<String, String> = tag
|
|
104
|
-
.attributes()
|
|
105
|
-
.iter()
|
|
106
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
107
|
-
.collect();
|
|
105
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
108
106
|
|
|
109
107
|
let node_id = node_handle.get_inner();
|
|
110
108
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -11,6 +11,8 @@ use std::collections::BTreeMap;
|
|
|
11
11
|
|
|
12
12
|
use crate::converter::Context;
|
|
13
13
|
use crate::converter::dom_context::DomContext;
|
|
14
|
+
#[cfg(feature = "visitor")]
|
|
15
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
14
16
|
use crate::converter::utility::preprocessing::sanitize_markdown_url;
|
|
15
17
|
use crate::options::ConversionOptions;
|
|
16
18
|
|
|
@@ -123,11 +125,7 @@ pub fn handle_img(
|
|
|
123
125
|
let image_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
124
126
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
125
127
|
|
|
126
|
-
let attributes: BTreeMap<String, String> = tag
|
|
127
|
-
.attributes()
|
|
128
|
-
.iter()
|
|
129
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
130
|
-
.collect();
|
|
128
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
131
129
|
|
|
132
130
|
let node_id = node_handle.get_inner();
|
|
133
131
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -15,6 +15,8 @@ use crate::converter::block::heading::{find_single_heading_child, heading_allows
|
|
|
15
15
|
use crate::converter::dom_context::DomContext;
|
|
16
16
|
use crate::converter::inline::link::append_markdown_link;
|
|
17
17
|
use crate::converter::main::walk_node;
|
|
18
|
+
#[cfg(feature = "visitor")]
|
|
19
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
18
20
|
use crate::converter::utility::content::{
|
|
19
21
|
collect_link_label_text, escape_link_label, get_text_content, normalize_link_label, normalized_tag_name,
|
|
20
22
|
};
|
|
@@ -194,11 +196,7 @@ pub fn handle_link(
|
|
|
194
196
|
let link_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
195
197
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
196
198
|
|
|
197
|
-
let attributes: BTreeMap<String, String> = tag
|
|
198
|
-
.attributes()
|
|
199
|
-
.iter()
|
|
200
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
201
|
-
.collect();
|
|
199
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
202
200
|
|
|
203
201
|
let node_id = node_handle.get_inner();
|
|
204
202
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
//! - Visitor callbacks for custom code processing
|
|
11
11
|
//! - Whitespace normalization for kbd/samp elements
|
|
12
12
|
|
|
13
|
+
#[cfg(feature = "visitor")]
|
|
14
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
13
15
|
use crate::options::ConversionOptions;
|
|
14
16
|
use crate::text;
|
|
15
17
|
#[allow(unused_imports)]
|
|
@@ -120,11 +122,7 @@ fn handle_code(
|
|
|
120
122
|
let code_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
121
123
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
122
124
|
|
|
123
|
-
let attributes: BTreeMap<String, String> = tag
|
|
124
|
-
.attributes()
|
|
125
|
-
.iter()
|
|
126
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
127
|
-
.collect();
|
|
125
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
128
126
|
|
|
129
127
|
let node_id = node_handle.get_inner();
|
|
130
128
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
//! - Visitor callbacks for custom emphasis processing
|
|
9
9
|
//! - Bootstrap caret detection (.caret class)
|
|
10
10
|
|
|
11
|
+
#[cfg(feature = "visitor")]
|
|
12
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
11
13
|
use crate::options::{ConversionOptions, OutputFormat};
|
|
12
14
|
#[allow(unused_imports)]
|
|
13
15
|
use std::collections::BTreeMap;
|
|
@@ -106,11 +108,7 @@ fn handle_strong(
|
|
|
106
108
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
107
109
|
|
|
108
110
|
let text_content = get_text_content(node_handle, parser, dom_ctx);
|
|
109
|
-
let attributes: BTreeMap<String, String> = tag
|
|
110
|
-
.attributes()
|
|
111
|
-
.iter()
|
|
112
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
113
|
-
.collect();
|
|
111
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
114
112
|
|
|
115
113
|
let node_id = node_handle.get_inner();
|
|
116
114
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -246,11 +244,7 @@ fn handle_emphasis(
|
|
|
246
244
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
247
245
|
|
|
248
246
|
let text_content = get_text_content(node_handle, parser, dom_ctx);
|
|
249
|
-
let attributes: BTreeMap<String, String> = tag
|
|
250
|
-
.attributes()
|
|
251
|
-
.iter()
|
|
252
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
253
|
-
.collect();
|
|
247
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
254
248
|
|
|
255
249
|
let node_id = node_handle.get_inner();
|
|
256
250
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -9,7 +9,9 @@
|
|
|
9
9
|
//! - Metadata collection for links (links, URLs, titles, rel attributes)
|
|
10
10
|
//! - Block-level content within links (via inline context)
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
#[cfg(feature = "visitor")]
|
|
13
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
14
|
+
use crate::converter::utility::content::{collect_link_label_text, escape_link_label, normalize_link_label};
|
|
13
15
|
use crate::converter::utility::preprocessing::sanitize_markdown_url;
|
|
14
16
|
use crate::options::ConversionOptions;
|
|
15
17
|
use std::collections::BTreeMap;
|
|
@@ -230,11 +232,7 @@ pub(crate) fn handle(
|
|
|
230
232
|
let link_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
231
233
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
232
234
|
|
|
233
|
-
let attributes: BTreeMap<String, String> = tag
|
|
234
|
-
.attributes()
|
|
235
|
-
.iter()
|
|
236
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
237
|
-
.collect();
|
|
235
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
238
236
|
|
|
239
237
|
let node_id = node_handle.get_inner();
|
|
240
238
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -341,58 +339,6 @@ pub(crate) fn handle(
|
|
|
341
339
|
}
|
|
342
340
|
}
|
|
343
341
|
|
|
344
|
-
/// Escape special Markdown characters in link labels.
|
|
345
|
-
///
|
|
346
|
-
/// Escapes unmatched closing brackets `]` to prevent accidental link termination.
|
|
347
|
-
/// Tracks bracket nesting to avoid escaping matched closing brackets.
|
|
348
|
-
///
|
|
349
|
-
/// # Examples
|
|
350
|
-
/// ```text
|
|
351
|
-
/// Input: "Click [here] for more"
|
|
352
|
-
/// Output: "Click [here\\] for more" (closing bracket is escaped because it's unmatched)
|
|
353
|
-
///
|
|
354
|
-
/// Input: "Normal text"
|
|
355
|
-
/// Output: "Normal text" (no escaping needed)
|
|
356
|
-
/// ```
|
|
357
|
-
fn escape_link_label(text: &str) -> String {
|
|
358
|
-
if text.is_empty() {
|
|
359
|
-
return String::new();
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
let mut result = String::with_capacity(text.len());
|
|
363
|
-
let mut backslash_count = 0usize;
|
|
364
|
-
let mut bracket_depth = 0usize;
|
|
365
|
-
|
|
366
|
-
for ch in text.chars() {
|
|
367
|
-
if ch == '\\' {
|
|
368
|
-
result.push('\\');
|
|
369
|
-
backslash_count += 1;
|
|
370
|
-
continue;
|
|
371
|
-
}
|
|
372
|
-
|
|
373
|
-
let is_escaped = backslash_count % 2 == 1;
|
|
374
|
-
backslash_count = 0;
|
|
375
|
-
|
|
376
|
-
match ch {
|
|
377
|
-
'[' if !is_escaped => {
|
|
378
|
-
bracket_depth = bracket_depth.saturating_add(1);
|
|
379
|
-
result.push('[');
|
|
380
|
-
}
|
|
381
|
-
']' if !is_escaped => {
|
|
382
|
-
if bracket_depth == 0 {
|
|
383
|
-
result.push('\\');
|
|
384
|
-
} else {
|
|
385
|
-
bracket_depth -= 1;
|
|
386
|
-
}
|
|
387
|
-
result.push(']');
|
|
388
|
-
}
|
|
389
|
-
_ => result.push(ch),
|
|
390
|
-
}
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
result
|
|
394
|
-
}
|
|
395
|
-
|
|
396
342
|
/// Format and append a Markdown link to the output string.
|
|
397
343
|
///
|
|
398
344
|
/// Generates the link syntax: `[label](href "title")`
|
|
@@ -462,115 +408,3 @@ pub(crate) fn append_markdown_link(
|
|
|
462
408
|
|
|
463
409
|
output.push(')');
|
|
464
410
|
}
|
|
465
|
-
|
|
466
|
-
/// Collect text content from direct inline children of a link element.
|
|
467
|
-
///
|
|
468
|
-
/// Performs a shallow scan to find text content, distinguishing between:
|
|
469
|
-
/// - Inline text (normal flow, accumulated)
|
|
470
|
-
/// - Block-level elements (stop at them, mark `saw_block`)
|
|
471
|
-
/// - Comments (stop processing)
|
|
472
|
-
///
|
|
473
|
-
/// Returns:
|
|
474
|
-
/// - `(text, block_nodes, saw_block)` where:
|
|
475
|
-
/// - `text` is concatenated inline text
|
|
476
|
-
/// - `block_nodes` is list of block-level children found
|
|
477
|
-
/// - `saw_block` indicates if any block elements were encountered
|
|
478
|
-
///
|
|
479
|
-
/// # Algorithm
|
|
480
|
-
/// Uses a stack-based approach to traverse the DOM tree, accumulating text
|
|
481
|
-
/// from inline elements while identifying block-level boundaries.
|
|
482
|
-
fn collect_link_label_text(
|
|
483
|
-
children: &[NodeHandle],
|
|
484
|
-
parser: &Parser,
|
|
485
|
-
dom_ctx: &DomContext,
|
|
486
|
-
) -> (String, Vec<NodeHandle>, bool) {
|
|
487
|
-
let mut text = String::new();
|
|
488
|
-
let mut saw_block = false;
|
|
489
|
-
let mut block_nodes = Vec::new();
|
|
490
|
-
let mut stack: Vec<_> = children.iter().rev().copied().collect();
|
|
491
|
-
|
|
492
|
-
while let Some(handle) = stack.pop() {
|
|
493
|
-
if let Some(node) = handle.get(parser) {
|
|
494
|
-
match node {
|
|
495
|
-
tl::Node::Raw(bytes) => {
|
|
496
|
-
let raw = bytes.as_utf8_str();
|
|
497
|
-
let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
|
|
498
|
-
text.push_str(decoded.as_ref());
|
|
499
|
-
}
|
|
500
|
-
tl::Node::Tag(tag) => {
|
|
501
|
-
let is_block = dom_ctx.tag_info(handle.get_inner(), parser).map_or_else(
|
|
502
|
-
|| {
|
|
503
|
-
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|
|
504
|
-
is_block_level_element(tag_name.as_ref())
|
|
505
|
-
},
|
|
506
|
-
|info| info.is_block,
|
|
507
|
-
);
|
|
508
|
-
if is_block {
|
|
509
|
-
saw_block = true;
|
|
510
|
-
block_nodes.push(handle);
|
|
511
|
-
continue;
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
if let Some(children) = dom_ctx.children_of(handle.get_inner()) {
|
|
515
|
-
for child in children.iter().rev() {
|
|
516
|
-
stack.push(*child);
|
|
517
|
-
}
|
|
518
|
-
} else {
|
|
519
|
-
let tag_children = tag.children();
|
|
520
|
-
let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
|
|
521
|
-
child_nodes.reverse();
|
|
522
|
-
stack.extend(child_nodes);
|
|
523
|
-
}
|
|
524
|
-
}
|
|
525
|
-
_ => {}
|
|
526
|
-
}
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
|
|
530
|
-
(text, block_nodes, saw_block)
|
|
531
|
-
}
|
|
532
|
-
|
|
533
|
-
/// Normalize link label text.
|
|
534
|
-
///
|
|
535
|
-
/// Collapses line breaks and normalizes whitespace:
|
|
536
|
-
/// - Replaces `\n` and `\r` with spaces
|
|
537
|
-
/// - Collapses multiple consecutive spaces to single space
|
|
538
|
-
/// - Trims leading/trailing whitespace
|
|
539
|
-
///
|
|
540
|
-
/// This is required by the Markdown spec for link labels to function properly.
|
|
541
|
-
///
|
|
542
|
-
/// # Examples
|
|
543
|
-
/// ```text
|
|
544
|
-
/// Input: "Line 1\nLine 2"
|
|
545
|
-
/// Output: "Line 1 Line 2"
|
|
546
|
-
///
|
|
547
|
-
/// Input: "Text with spaces"
|
|
548
|
-
/// Output: "Text with spaces"
|
|
549
|
-
/// ```
|
|
550
|
-
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
551
|
-
fn normalize_link_label(label: &str) -> String {
|
|
552
|
-
let mut needs_collapse = false;
|
|
553
|
-
for ch in label.chars() {
|
|
554
|
-
if ch == '\n' || ch == '\r' {
|
|
555
|
-
needs_collapse = true;
|
|
556
|
-
break;
|
|
557
|
-
}
|
|
558
|
-
}
|
|
559
|
-
|
|
560
|
-
let collapsed = if needs_collapse {
|
|
561
|
-
let mut collapsed = String::with_capacity(label.len());
|
|
562
|
-
for ch in label.chars() {
|
|
563
|
-
if ch == '\n' || ch == '\r' {
|
|
564
|
-
collapsed.push(' ');
|
|
565
|
-
} else {
|
|
566
|
-
collapsed.push(ch);
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
std::borrow::Cow::Owned(collapsed)
|
|
570
|
-
} else {
|
|
571
|
-
std::borrow::Cow::Borrowed(label)
|
|
572
|
-
};
|
|
573
|
-
|
|
574
|
-
let normalized = crate::text::normalize_whitespace_cow(collapsed.as_ref());
|
|
575
|
-
normalized.as_ref().trim().to_string()
|
|
576
|
-
}
|
|
@@ -5,7 +5,11 @@
|
|
|
5
5
|
//! - Strikethrough (del, s tags) with ~~ syntax
|
|
6
6
|
//! - Inserted/underlined text (ins, u tags) with == syntax
|
|
7
7
|
|
|
8
|
+
#[cfg(feature = "visitor")]
|
|
9
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
8
10
|
use crate::options::{ConversionOptions, OutputFormat};
|
|
11
|
+
#[cfg(feature = "visitor")]
|
|
12
|
+
use std::collections::BTreeMap;
|
|
9
13
|
use tl::{NodeHandle, Parser};
|
|
10
14
|
|
|
11
15
|
type Context = crate::converter::Context;
|
|
@@ -135,14 +139,8 @@ pub fn handle_strikethrough(
|
|
|
135
139
|
let strikethrough_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
136
140
|
use crate::converter::get_text_content;
|
|
137
141
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
138
|
-
use std::collections::BTreeMap;
|
|
139
|
-
|
|
140
142
|
let text_content = get_text_content(node_handle, parser, dom_ctx);
|
|
141
|
-
let attributes: BTreeMap<String, String> = tag
|
|
142
|
-
.attributes()
|
|
143
|
-
.iter()
|
|
144
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
145
|
-
.collect();
|
|
143
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
146
144
|
|
|
147
145
|
let node_id = node_handle.get_inner();
|
|
148
146
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -262,14 +260,9 @@ pub fn handle_inserted(
|
|
|
262
260
|
let underline_output = if let Some(ref visitor_handle) = ctx.visitor {
|
|
263
261
|
use crate::converter::get_text_content;
|
|
264
262
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
265
|
-
use std::collections::BTreeMap;
|
|
266
263
|
|
|
267
264
|
let text_content = get_text_content(node_handle, parser, dom_ctx);
|
|
268
|
-
let attributes: BTreeMap<String, String> = tag
|
|
269
|
-
.attributes()
|
|
270
|
-
.iter()
|
|
271
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
272
|
-
.collect();
|
|
265
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
273
266
|
|
|
274
267
|
let node_id = node_handle.get_inner();
|
|
275
268
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -377,14 +370,9 @@ pub fn handle_underline(
|
|
|
377
370
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
378
371
|
use crate::converter::get_text_content;
|
|
379
372
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
380
|
-
use std::collections::BTreeMap;
|
|
381
373
|
|
|
382
374
|
let text_content = get_text_content(node_handle, parser, dom_ctx);
|
|
383
|
-
let attributes: BTreeMap<String, String> = tag
|
|
384
|
-
.attributes()
|
|
385
|
-
.iter()
|
|
386
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
387
|
-
.collect();
|
|
375
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
388
376
|
|
|
389
377
|
let node_id = node_handle.get_inner();
|
|
390
378
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
|
|
9
9
|
use crate::converter::main_helpers::tag_name_eq;
|
|
10
10
|
use crate::converter::main_helpers::trim_trailing_whitespace;
|
|
11
|
+
#[cfg(feature = "visitor")]
|
|
12
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
11
13
|
use crate::converter::utility::content::normalized_tag_name;
|
|
12
14
|
use crate::converter::walk_node;
|
|
13
15
|
use crate::options::ConversionOptions;
|
|
@@ -216,11 +218,7 @@ pub(crate) fn handle_li(
|
|
|
216
218
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
217
219
|
use std::collections::BTreeMap;
|
|
218
220
|
|
|
219
|
-
let attributes: BTreeMap<String, String> = tag
|
|
220
|
-
.attributes()
|
|
221
|
-
.iter()
|
|
222
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
223
|
-
.collect();
|
|
221
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
224
222
|
|
|
225
223
|
let parent_tag = dom_ctx
|
|
226
224
|
.parent_of(node_handle.get_inner())
|
|
@@ -10,6 +10,8 @@ use super::utils::{
|
|
|
10
10
|
add_list_leading_separator, add_nested_list_trailing_separator, calculate_list_nesting_depth, is_loose_list,
|
|
11
11
|
process_list_children,
|
|
12
12
|
};
|
|
13
|
+
#[cfg(feature = "visitor")]
|
|
14
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
13
15
|
use crate::options::ConversionOptions;
|
|
14
16
|
#[allow(unused_imports)]
|
|
15
17
|
use std::collections::BTreeMap;
|
|
@@ -60,11 +62,7 @@ pub(crate) fn handle_ol(
|
|
|
60
62
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
61
63
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
62
64
|
|
|
63
|
-
let attributes: BTreeMap<String, String> = tag
|
|
64
|
-
.attributes()
|
|
65
|
-
.iter()
|
|
66
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
67
|
-
.collect();
|
|
65
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
68
66
|
|
|
69
67
|
let parent_tag = dom_ctx
|
|
70
68
|
.parent_of(node_handle.get_inner())
|
|
@@ -129,11 +127,7 @@ pub(crate) fn handle_ol(
|
|
|
129
127
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
130
128
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
131
129
|
|
|
132
|
-
let attributes: BTreeMap<String, String> = tag
|
|
133
|
-
.attributes()
|
|
134
|
-
.iter()
|
|
135
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
136
|
-
.collect();
|
|
130
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
137
131
|
|
|
138
132
|
let parent_tag = dom_ctx
|
|
139
133
|
.parent_of(node_handle.get_inner())
|
|
@@ -10,7 +10,11 @@ use super::utils::{
|
|
|
10
10
|
add_list_leading_separator, add_nested_list_trailing_separator, calculate_list_nesting_depth, is_loose_list,
|
|
11
11
|
process_list_children,
|
|
12
12
|
};
|
|
13
|
+
#[cfg(feature = "visitor")]
|
|
14
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
13
15
|
use crate::options::ConversionOptions;
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
use std::collections::BTreeMap;
|
|
14
18
|
use tl;
|
|
15
19
|
|
|
16
20
|
// Type aliases for Context and DomContext to avoid circular imports
|
|
@@ -51,13 +55,8 @@ pub(crate) fn handle_ul(
|
|
|
51
55
|
#[cfg(feature = "visitor")]
|
|
52
56
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
53
57
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
54
|
-
use std::collections::BTreeMap;
|
|
55
58
|
|
|
56
|
-
let attributes: BTreeMap<String, String> = tag
|
|
57
|
-
.attributes()
|
|
58
|
-
.iter()
|
|
59
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
60
|
-
.collect();
|
|
59
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
61
60
|
|
|
62
61
|
let parent_tag = dom_ctx
|
|
63
62
|
.parent_of(node_handle.get_inner())
|
|
@@ -121,13 +120,8 @@ pub(crate) fn handle_ul(
|
|
|
121
120
|
#[cfg(feature = "visitor")]
|
|
122
121
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
123
122
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
124
|
-
use std::collections::BTreeMap;
|
|
125
123
|
|
|
126
|
-
let attributes: BTreeMap<String, String> = tag
|
|
127
|
-
.attributes()
|
|
128
|
-
.iter()
|
|
129
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
130
|
-
.collect();
|
|
124
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
131
125
|
|
|
132
126
|
let parent_tag = dom_ctx
|
|
133
127
|
.parent_of(node_handle.get_inner())
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
//! Contains helper functions for loose list detection, indentation calculation,
|
|
4
4
|
//! list spacing, and list child processing.
|
|
5
5
|
|
|
6
|
+
use crate::converter::main_helpers::{tag_name_eq, trim_trailing_whitespace};
|
|
6
7
|
use crate::options::{ConversionOptions, ListIndentType};
|
|
7
8
|
use tl;
|
|
8
9
|
|
|
@@ -11,18 +12,6 @@ use tl;
|
|
|
11
12
|
type Context = crate::converter::Context;
|
|
12
13
|
type DomContext = crate::converter::DomContext;
|
|
13
14
|
|
|
14
|
-
/// Remove trailing spaces and tabs from output string.
|
|
15
|
-
fn trim_trailing_whitespace(output: &mut String) {
|
|
16
|
-
while output.ends_with(' ') || output.ends_with('\t') {
|
|
17
|
-
output.pop();
|
|
18
|
-
}
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
/// Check if tag names are equal (case-insensitive).
|
|
22
|
-
fn tag_name_eq<'a>(a: impl AsRef<str>, b: &str) -> bool {
|
|
23
|
-
a.as_ref().eq_ignore_ascii_case(b)
|
|
24
|
-
}
|
|
25
|
-
|
|
26
15
|
/// Calculate indentation level for list item continuations.
|
|
27
16
|
///
|
|
28
17
|
/// Returns the number of 4-space indent groups needed for list continuations.
|