html-to-markdown 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +5 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,26 +1,23 @@
|
|
|
1
|
-
//! Main HTML to Markdown conversion
|
|
1
|
+
//! Main HTML to Markdown conversion API.
|
|
2
2
|
//!
|
|
3
|
-
//! This module provides the primary
|
|
4
|
-
//! including support for metadata extraction, inline image collection, and custom visitors.
|
|
3
|
+
//! This module provides the primary `convert()` function for converting HTML to Markdown.
|
|
5
4
|
|
|
6
5
|
use std::borrow::Cow;
|
|
7
6
|
|
|
8
7
|
use crate::error::Result;
|
|
9
8
|
use crate::options::{ConversionOptions, WhitespaceMode};
|
|
10
9
|
use crate::text;
|
|
10
|
+
use crate::types::ConversionResult;
|
|
11
11
|
use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
|
|
12
12
|
use crate::{ConversionError, ConversionOptionsUpdate};
|
|
13
13
|
|
|
14
|
-
#[cfg(feature = "visitor")]
|
|
15
|
-
use crate::visitor;
|
|
16
|
-
#[cfg(feature = "async-visitor")]
|
|
17
|
-
use crate::visitor_helpers;
|
|
18
|
-
#[cfg(feature = "metadata")]
|
|
19
|
-
use crate::{ExtendedMetadata, MetadataConfig};
|
|
20
14
|
#[cfg(feature = "inline-images")]
|
|
21
|
-
use crate::
|
|
15
|
+
use crate::InlineImageConfig;
|
|
16
|
+
#[cfg(feature = "metadata")]
|
|
17
|
+
use crate::{HtmlMetadata, MetadataConfig};
|
|
22
18
|
|
|
23
|
-
/// Convert HTML to Markdown
|
|
19
|
+
/// Convert HTML to Markdown, returning a [`ConversionResult`] with content, metadata, images,
|
|
20
|
+
/// and warnings.
|
|
24
21
|
///
|
|
25
22
|
/// # Arguments
|
|
26
23
|
///
|
|
@@ -33,265 +30,121 @@ use crate::{HtmlExtraction, InlineImageConfig};
|
|
|
33
30
|
/// use html_to_markdown_rs::{convert, ConversionOptions};
|
|
34
31
|
///
|
|
35
32
|
/// let html = "<h1>Hello World</h1>";
|
|
36
|
-
/// let
|
|
37
|
-
/// assert!(
|
|
33
|
+
/// let result = convert(html, None).unwrap();
|
|
34
|
+
/// assert!(result.content.as_deref().unwrap_or("").contains("Hello World"));
|
|
38
35
|
/// ```
|
|
36
|
+
///
|
|
39
37
|
/// # Errors
|
|
40
38
|
///
|
|
41
39
|
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
42
|
-
pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<
|
|
40
|
+
pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<ConversionResult> {
|
|
41
|
+
use std::cell::RefCell;
|
|
42
|
+
use std::rc::Rc;
|
|
43
|
+
|
|
43
44
|
let options = options.unwrap_or_default();
|
|
44
45
|
|
|
45
46
|
let normalized_html = normalize_input(html)?;
|
|
46
47
|
|
|
48
|
+
// Fast path: plain text with no HTML tags — skip full parsing pipeline.
|
|
47
49
|
if !options.wrap {
|
|
48
50
|
if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
|
|
49
|
-
return Ok(
|
|
51
|
+
return Ok(ConversionResult {
|
|
52
|
+
content: Some(markdown),
|
|
53
|
+
..ConversionResult::default()
|
|
54
|
+
});
|
|
50
55
|
}
|
|
51
56
|
}
|
|
52
57
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
Ok(markdown)
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
/// Convert HTML to Markdown while collecting inline image assets (requires the `inline-images` feature).
|
|
63
|
-
///
|
|
64
|
-
/// Extracts inline image data URIs and inline `<svg>` elements alongside Markdown conversion.
|
|
65
|
-
///
|
|
66
|
-
/// # Arguments
|
|
67
|
-
///
|
|
68
|
-
/// * `html` - The HTML string to convert
|
|
69
|
-
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
|
|
70
|
-
/// * `image_cfg` - Configuration controlling inline image extraction
|
|
71
|
-
/// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
|
|
72
|
-
/// # Errors
|
|
73
|
-
///
|
|
74
|
-
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
75
|
-
#[cfg(feature = "inline-images")]
|
|
76
|
-
pub fn convert_with_inline_images(
|
|
77
|
-
html: &str,
|
|
78
|
-
options: Option<ConversionOptions>,
|
|
79
|
-
image_cfg: InlineImageConfig,
|
|
80
|
-
#[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
|
|
81
|
-
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
82
|
-
) -> Result<HtmlExtraction> {
|
|
83
|
-
use std::cell::RefCell;
|
|
84
|
-
use std::rc::Rc;
|
|
85
|
-
|
|
86
|
-
let options = options.unwrap_or_default();
|
|
87
|
-
|
|
88
|
-
let normalized_html = normalize_input(html)?;
|
|
89
|
-
|
|
90
|
-
let collector = Rc::new(RefCell::new(crate::inline_images::InlineImageCollector::new(
|
|
91
|
-
image_cfg,
|
|
92
|
-
)?));
|
|
58
|
+
// Determine whether metadata / inline-image extraction is requested.
|
|
59
|
+
#[cfg(feature = "metadata")]
|
|
60
|
+
let wants_metadata = options.extract_metadata;
|
|
61
|
+
#[cfg(not(feature = "metadata"))]
|
|
62
|
+
let wants_metadata = false;
|
|
93
63
|
|
|
94
|
-
#[cfg(feature = "
|
|
95
|
-
let
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
Some(Rc::clone(&collector)),
|
|
99
|
-
None,
|
|
100
|
-
visitor,
|
|
101
|
-
)?;
|
|
102
|
-
#[cfg(not(feature = "visitor"))]
|
|
103
|
-
let markdown = crate::converter::convert_html_impl(
|
|
104
|
-
normalized_html.as_ref(),
|
|
105
|
-
&options,
|
|
106
|
-
Some(Rc::clone(&collector)),
|
|
107
|
-
None,
|
|
108
|
-
None,
|
|
109
|
-
)?;
|
|
64
|
+
#[cfg(feature = "inline-images")]
|
|
65
|
+
let wants_images = options.extract_images;
|
|
66
|
+
#[cfg(not(feature = "inline-images"))]
|
|
67
|
+
let wants_images = false;
|
|
110
68
|
|
|
111
|
-
|
|
112
|
-
|
|
69
|
+
// Build optional collectors based on requested features.
|
|
70
|
+
#[cfg(feature = "metadata")]
|
|
71
|
+
let metadata_collector = if wants_metadata {
|
|
72
|
+
Some(Rc::new(RefCell::new(crate::metadata::MetadataCollector::new(
|
|
73
|
+
MetadataConfig::default(),
|
|
74
|
+
))))
|
|
113
75
|
} else {
|
|
114
|
-
|
|
76
|
+
None
|
|
115
77
|
};
|
|
116
78
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
})
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
/// Convert HTML to Markdown with comprehensive metadata extraction (requires the `metadata` feature).
|
|
130
|
-
///
|
|
131
|
-
/// Performs HTML-to-Markdown conversion while simultaneously extracting structured metadata in a
|
|
132
|
-
/// single pass for maximum efficiency. Ideal for content analysis, SEO optimization, and document
|
|
133
|
-
/// indexing workflows.
|
|
134
|
-
///
|
|
135
|
-
/// # Arguments
|
|
136
|
-
///
|
|
137
|
-
/// * `html` - The HTML string to convert. Will normalize line endings (CRLF → LF).
|
|
138
|
-
/// * `options` - Optional conversion configuration. Defaults to `ConversionOptions::default()` if `None`.
|
|
139
|
-
/// Controls heading style, list indentation, escape behavior, wrapping, and other output formatting.
|
|
140
|
-
/// * `metadata_cfg` - Configuration for metadata extraction granularity. Use `MetadataConfig::default()`
|
|
141
|
-
/// to extract all metadata types, or customize with selective extraction flags.
|
|
142
|
-
/// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
|
|
143
|
-
///
|
|
144
|
-
/// # Returns
|
|
145
|
-
///
|
|
146
|
-
/// On success, returns a tuple of:
|
|
147
|
-
/// - `String`: The converted Markdown output
|
|
148
|
-
/// - `ExtendedMetadata`: Comprehensive metadata containing:
|
|
149
|
-
/// - `document`: Title, description, author, language, Open Graph, Twitter Card, and other meta tags
|
|
150
|
-
/// - `headers`: All heading elements (h1-h6) with hierarchy and IDs
|
|
151
|
-
/// - `links`: Hyperlinks classified as anchor, internal, external, email, or phone
|
|
152
|
-
/// - `images`: Image elements with source, dimensions, and alt text
|
|
153
|
-
/// - `structured_data`: JSON-LD, Microdata, and `RDFa` blocks
|
|
154
|
-
///
|
|
155
|
-
/// # Errors
|
|
156
|
-
///
|
|
157
|
-
/// Returns `ConversionError` if:
|
|
158
|
-
/// - HTML parsing fails
|
|
159
|
-
/// - Invalid UTF-8 sequences encountered
|
|
160
|
-
/// - Internal panic during conversion (wrapped in `ConversionError::Panic`)
|
|
161
|
-
/// - Configuration size limits exceeded
|
|
162
|
-
///
|
|
163
|
-
/// # Performance Notes
|
|
164
|
-
///
|
|
165
|
-
/// - Single-pass collection: metadata extraction has minimal overhead
|
|
166
|
-
/// - Zero cost when metadata feature is disabled
|
|
167
|
-
/// - Pre-allocated buffers: typically handles 50+ headers, 100+ links, 20+ images efficiently
|
|
168
|
-
/// - Structured data size-limited to prevent memory exhaustion (configurable)
|
|
169
|
-
///
|
|
170
|
-
/// # Example: Basic Usage
|
|
171
|
-
///
|
|
172
|
-
/// ```ignore
|
|
173
|
-
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
|
|
174
|
-
///
|
|
175
|
-
/// let html = r#"
|
|
176
|
-
/// <html lang="en">
|
|
177
|
-
/// <head><title>My Article</title></head>
|
|
178
|
-
/// <body>
|
|
179
|
-
/// <h1 id="intro">Introduction</h1>
|
|
180
|
-
/// <p>Welcome to <a href="https://example.com">our site</a></p>
|
|
181
|
-
/// </body>
|
|
182
|
-
/// </html>
|
|
183
|
-
/// "#;
|
|
184
|
-
///
|
|
185
|
-
/// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
|
|
186
|
-
///
|
|
187
|
-
/// assert_eq!(metadata.document.title, Some("My Article".to_string()));
|
|
188
|
-
/// assert_eq!(metadata.document.language, Some("en".to_string()));
|
|
189
|
-
/// assert_eq!(metadata.headers[0].text, "Introduction");
|
|
190
|
-
/// assert_eq!(metadata.headers[0].id, Some("intro".to_string()));
|
|
191
|
-
/// assert_eq!(metadata.links.len(), 1);
|
|
192
|
-
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
|
|
193
|
-
/// ```
|
|
194
|
-
///
|
|
195
|
-
/// # Example: Selective Metadata Extraction
|
|
196
|
-
///
|
|
197
|
-
/// ```ignore
|
|
198
|
-
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
|
|
199
|
-
///
|
|
200
|
-
/// let html = "<html><body><h1>Title</h1><a href='#anchor'>Link</a></body></html>";
|
|
201
|
-
///
|
|
202
|
-
/// // Extract only headers and document metadata, skip links/images
|
|
203
|
-
/// let config = MetadataConfig {
|
|
204
|
-
/// extract_headers: true,
|
|
205
|
-
/// extract_links: false,
|
|
206
|
-
/// extract_images: false,
|
|
207
|
-
/// extract_structured_data: false,
|
|
208
|
-
/// max_structured_data_size: 0,
|
|
209
|
-
/// };
|
|
210
|
-
///
|
|
211
|
-
/// let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
|
|
212
|
-
/// assert!(metadata.headers.len() > 0);
|
|
213
|
-
/// assert!(metadata.links.is_empty()); // Not extracted
|
|
214
|
-
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
|
|
215
|
-
/// ```
|
|
216
|
-
///
|
|
217
|
-
/// # Example: With Conversion Options and Metadata Config
|
|
218
|
-
///
|
|
219
|
-
/// ```ignore
|
|
220
|
-
/// use html_to_markdown_rs::{convert_with_metadata, ConversionOptions, MetadataConfig, HeadingStyle};
|
|
221
|
-
///
|
|
222
|
-
/// let html = "<html><head><title>Blog Post</title></head><body><h1>Hello</h1></body></html>";
|
|
223
|
-
///
|
|
224
|
-
/// let options = ConversionOptions {
|
|
225
|
-
/// heading_style: HeadingStyle::Atx,
|
|
226
|
-
/// wrap: true,
|
|
227
|
-
/// wrap_width: 80,
|
|
228
|
-
/// ..Default::default()
|
|
229
|
-
/// };
|
|
230
|
-
///
|
|
231
|
-
/// let metadata_cfg = MetadataConfig::default();
|
|
232
|
-
///
|
|
233
|
-
/// let (markdown, metadata) = convert_with_metadata(html, Some(options), metadata_cfg, None)?;
|
|
234
|
-
/// // Markdown will use ATX-style headings (# H1, ## H2, etc.)
|
|
235
|
-
/// // Wrapped at 80 characters
|
|
236
|
-
/// // All metadata extracted
|
|
237
|
-
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
|
|
238
|
-
/// ```
|
|
239
|
-
///
|
|
240
|
-
/// # See Also
|
|
241
|
-
///
|
|
242
|
-
/// - [`convert`] - Simple HTML to Markdown conversion without metadata
|
|
243
|
-
/// - [`convert_with_inline_images`] - Conversion with inline image extraction
|
|
244
|
-
/// - [`MetadataConfig`] - Configuration for metadata extraction
|
|
245
|
-
/// - [`ExtendedMetadata`] - Metadata structure documentation
|
|
246
|
-
/// - [`metadata`] module - Detailed type documentation for metadata components
|
|
247
|
-
#[cfg(feature = "metadata")]
|
|
248
|
-
pub fn convert_with_metadata(
|
|
249
|
-
html: &str,
|
|
250
|
-
options: Option<ConversionOptions>,
|
|
251
|
-
metadata_cfg: MetadataConfig,
|
|
252
|
-
#[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
|
|
253
|
-
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
254
|
-
) -> Result<(String, ExtendedMetadata)> {
|
|
255
|
-
use std::cell::RefCell;
|
|
256
|
-
use std::rc::Rc;
|
|
257
|
-
|
|
258
|
-
// Disable YAML frontmatter prepending: metadata is returned as a struct,
|
|
259
|
-
// so embedding it in the content string is redundant and pollutes the output.
|
|
260
|
-
let mut options = options.unwrap_or_default();
|
|
261
|
-
options.extract_metadata = false;
|
|
79
|
+
#[cfg(feature = "inline-images")]
|
|
80
|
+
let image_collector = if wants_images {
|
|
81
|
+
use crate::inline_images::{DEFAULT_INLINE_IMAGE_LIMIT, InlineImageConfig as IIC};
|
|
82
|
+
Some(Rc::new(RefCell::new(crate::inline_images::InlineImageCollector::new(
|
|
83
|
+
IIC::new(DEFAULT_INLINE_IMAGE_LIMIT),
|
|
84
|
+
)?)))
|
|
85
|
+
} else {
|
|
86
|
+
None
|
|
87
|
+
};
|
|
262
88
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
let markdown = if options.wrap {
|
|
270
|
-
crate::wrapper::wrap_markdown(&markdown, &options)
|
|
89
|
+
// Build optional structure collector when requested.
|
|
90
|
+
let structure_collector: Option<std::rc::Rc<std::cell::RefCell<crate::types::StructureCollector>>> =
|
|
91
|
+
if options.include_document_structure {
|
|
92
|
+
Some(std::rc::Rc::new(std::cell::RefCell::new(
|
|
93
|
+
crate::types::StructureCollector::new(),
|
|
94
|
+
)))
|
|
271
95
|
} else {
|
|
272
|
-
|
|
96
|
+
None
|
|
273
97
|
};
|
|
274
|
-
return Ok((markdown, ExtendedMetadata::default()));
|
|
275
|
-
}
|
|
276
98
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
let markdown =
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
99
|
+
// Run the conversion pipeline.
|
|
100
|
+
// Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
|
|
101
|
+
// to return the finished DocumentStructure. We must not hold a second Rc reference.
|
|
102
|
+
let (markdown, document) = {
|
|
103
|
+
#[cfg(all(feature = "metadata", feature = "inline-images"))]
|
|
104
|
+
{
|
|
105
|
+
crate::converter::convert_html_impl(
|
|
106
|
+
normalized_html.as_ref(),
|
|
107
|
+
&options,
|
|
108
|
+
image_collector.as_ref().map(Rc::clone),
|
|
109
|
+
metadata_collector.as_ref().map(Rc::clone),
|
|
110
|
+
None,
|
|
111
|
+
structure_collector,
|
|
112
|
+
)?
|
|
113
|
+
}
|
|
114
|
+
#[cfg(all(feature = "metadata", not(feature = "inline-images")))]
|
|
115
|
+
{
|
|
116
|
+
crate::converter::convert_html_impl(
|
|
117
|
+
normalized_html.as_ref(),
|
|
118
|
+
&options,
|
|
119
|
+
None,
|
|
120
|
+
metadata_collector.as_ref().map(Rc::clone),
|
|
121
|
+
None,
|
|
122
|
+
structure_collector,
|
|
123
|
+
)?
|
|
124
|
+
}
|
|
125
|
+
#[cfg(all(not(feature = "metadata"), feature = "inline-images"))]
|
|
126
|
+
{
|
|
127
|
+
crate::converter::convert_html_impl(
|
|
128
|
+
normalized_html.as_ref(),
|
|
129
|
+
&options,
|
|
130
|
+
image_collector.as_ref().map(Rc::clone),
|
|
131
|
+
None,
|
|
132
|
+
None,
|
|
133
|
+
structure_collector,
|
|
134
|
+
)?
|
|
135
|
+
}
|
|
136
|
+
#[cfg(all(not(feature = "metadata"), not(feature = "inline-images")))]
|
|
137
|
+
{
|
|
138
|
+
crate::converter::convert_html_impl(
|
|
139
|
+
normalized_html.as_ref(),
|
|
140
|
+
&options,
|
|
141
|
+
None,
|
|
142
|
+
None,
|
|
143
|
+
None,
|
|
144
|
+
structure_collector,
|
|
145
|
+
)?
|
|
146
|
+
}
|
|
147
|
+
};
|
|
295
148
|
|
|
296
149
|
let markdown = if options.wrap {
|
|
297
150
|
crate::wrapper::wrap_markdown(&markdown, &options)
|
|
@@ -299,146 +152,67 @@ pub fn convert_with_metadata(
|
|
|
299
152
|
markdown
|
|
300
153
|
};
|
|
301
154
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
155
|
+
// Collect metadata if extracted.
|
|
156
|
+
#[cfg(feature = "metadata")]
|
|
157
|
+
let metadata = if let Some(collector) = metadata_collector {
|
|
158
|
+
Rc::try_unwrap(collector)
|
|
159
|
+
.map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
|
|
160
|
+
.into_inner()
|
|
161
|
+
.finish()
|
|
162
|
+
} else {
|
|
163
|
+
HtmlMetadata::default()
|
|
164
|
+
};
|
|
165
|
+
|
|
166
|
+
// Collect inline images if extracted.
|
|
167
|
+
#[cfg(feature = "inline-images")]
|
|
168
|
+
let (images, image_warnings) = if let Some(collector) = image_collector {
|
|
169
|
+
let c = Rc::try_unwrap(collector)
|
|
170
|
+
.map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?
|
|
171
|
+
.into_inner();
|
|
172
|
+
c.finish()
|
|
173
|
+
} else {
|
|
174
|
+
(Vec::new(), Vec::new())
|
|
175
|
+
};
|
|
306
176
|
|
|
307
|
-
|
|
177
|
+
// Map InlineImageWarnings → ProcessingWarnings.
|
|
178
|
+
#[cfg(feature = "inline-images")]
|
|
179
|
+
let warnings: Vec<crate::types::ProcessingWarning> = image_warnings
|
|
180
|
+
.into_iter()
|
|
181
|
+
.map(|w| crate::types::ProcessingWarning {
|
|
182
|
+
kind: crate::types::WarningKind::ImageExtractionFailed,
|
|
183
|
+
message: w.message,
|
|
184
|
+
})
|
|
185
|
+
.collect();
|
|
186
|
+
#[cfg(not(feature = "inline-images"))]
|
|
187
|
+
let warnings: Vec<crate::types::ProcessingWarning> = Vec::new();
|
|
188
|
+
|
|
189
|
+
let _ = wants_metadata;
|
|
190
|
+
let _ = wants_images;
|
|
191
|
+
|
|
192
|
+
Ok(ConversionResult {
|
|
193
|
+
content: Some(markdown),
|
|
194
|
+
document,
|
|
195
|
+
#[cfg(feature = "metadata")]
|
|
196
|
+
metadata,
|
|
197
|
+
tables: Vec::new(),
|
|
198
|
+
#[cfg(feature = "inline-images")]
|
|
199
|
+
images,
|
|
200
|
+
warnings,
|
|
201
|
+
})
|
|
308
202
|
}
|
|
309
203
|
|
|
310
|
-
///
|
|
311
|
-
///
|
|
312
|
-
/// This function allows you to provide a visitor implementation that can inspect,
|
|
313
|
-
/// modify, or replace the default conversion behavior for any HTML element type.
|
|
314
|
-
///
|
|
315
|
-
/// # Arguments
|
|
316
|
-
///
|
|
317
|
-
/// * `html` - The HTML input to convert
|
|
318
|
-
/// * `options` - Optional conversion options (uses defaults if None)
|
|
319
|
-
/// * `visitor` - Mutable reference to visitor implementation for customization
|
|
320
|
-
///
|
|
321
|
-
/// # Example
|
|
322
|
-
///
|
|
323
|
-
/// ```ignore
|
|
324
|
-
/// use html_to_markdown_rs::convert_with_visitor;
|
|
325
|
-
/// use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
|
|
326
|
-
///
|
|
327
|
-
/// #[derive(Debug)]
|
|
328
|
-
/// struct CustomVisitor;
|
|
329
|
-
///
|
|
330
|
-
/// impl HtmlVisitor for CustomVisitor {
|
|
331
|
-
/// fn visit_code_block(
|
|
332
|
-
/// &mut self,
|
|
333
|
-
/// _ctx: &NodeContext,
|
|
334
|
-
/// language: Option<&str>,
|
|
335
|
-
/// code: &str,
|
|
336
|
-
/// ) -> VisitResult {
|
|
337
|
-
/// VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
|
|
338
|
-
/// }
|
|
339
|
-
/// }
|
|
340
|
-
///
|
|
341
|
-
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
|
|
342
|
-
/// let mut visitor = CustomVisitor;
|
|
343
|
-
/// let markdown = convert_with_visitor(html, None, &mut visitor).unwrap();
|
|
344
|
-
/// ```
|
|
204
|
+
/// Internal: convert with visitor support. Used by FFI crate.
|
|
205
|
+
/// Will be removed when convert() accepts visitor parameter directly.
|
|
345
206
|
#[cfg(feature = "visitor")]
|
|
346
|
-
|
|
347
|
-
///
|
|
348
|
-
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
207
|
+
#[doc(hidden)]
|
|
349
208
|
pub fn convert_with_visitor(
|
|
350
209
|
html: &str,
|
|
351
210
|
options: Option<ConversionOptions>,
|
|
352
|
-
visitor: Option<visitor::VisitorHandle>,
|
|
211
|
+
visitor: Option<crate::visitor::VisitorHandle>,
|
|
353
212
|
) -> Result<String> {
|
|
354
213
|
let options = options.unwrap_or_default();
|
|
355
|
-
|
|
356
214
|
let normalized_html = normalize_input(html)?;
|
|
357
|
-
|
|
358
215
|
let markdown = crate::converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;
|
|
359
|
-
|
|
360
|
-
if options.wrap {
|
|
361
|
-
Ok(crate::wrapper::wrap_markdown(&markdown, &options))
|
|
362
|
-
} else {
|
|
363
|
-
Ok(markdown)
|
|
364
|
-
}
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
#[cfg(feature = "async-visitor")]
|
|
368
|
-
/// Convert HTML to Markdown with an async visitor callback.
|
|
369
|
-
///
|
|
370
|
-
/// This async function allows you to provide an async visitor implementation that can inspect,
|
|
371
|
-
/// modify, or replace the default conversion behavior for any HTML element type.
|
|
372
|
-
///
|
|
373
|
-
/// This function is useful for:
|
|
374
|
-
/// - Python async functions (with `async def` and `asyncio`)
|
|
375
|
-
/// - TypeScript/JavaScript async functions (with `Promise`-based callbacks)
|
|
376
|
-
/// - Elixir processes (with message-passing async operations)
|
|
377
|
-
///
|
|
378
|
-
/// For synchronous languages (Ruby, PHP, Go, Java, C#), use `convert_with_visitor` instead.
|
|
379
|
-
///
|
|
380
|
-
/// # Note
|
|
381
|
-
///
|
|
382
|
-
/// The async visitor trait (`AsyncHtmlVisitor`) and async dispatch helpers are designed to be
|
|
383
|
-
/// consumed by language bindings (`PyO3`, NAPI-RS, Magnus, etc.) which can bridge async/await
|
|
384
|
-
/// semantics from their host languages. The conversion pipeline wraps async visitor calls using
|
|
385
|
-
/// tokio's runtime to support both multi-threaded and current_thread runtimes (like NAPI's).
|
|
386
|
-
///
|
|
387
|
-
/// Binding implementations will be responsible for running async callbacks on appropriate
|
|
388
|
-
/// event loops (asyncio for Python, Promise chains for TypeScript, etc.).
|
|
389
|
-
///
|
|
390
|
-
/// # Arguments
|
|
391
|
-
///
|
|
392
|
-
/// * `html` - The HTML input to convert
|
|
393
|
-
/// * `options` - Optional conversion options (uses defaults if None)
|
|
394
|
-
/// * `visitor` - Optional async visitor implementing `AsyncHtmlVisitor` trait for customization
|
|
395
|
-
///
|
|
396
|
-
/// # Example (Rust-like async)
|
|
397
|
-
///
|
|
398
|
-
/// ```ignore
|
|
399
|
-
/// use html_to_markdown_rs::convert_with_async_visitor;
|
|
400
|
-
/// use html_to_markdown_rs::visitor::{AsyncHtmlVisitor, NodeContext, VisitResult};
|
|
401
|
-
/// use async_trait::async_trait;
|
|
402
|
-
/// use std::rc::Rc;
|
|
403
|
-
/// use std::cell::RefCell;
|
|
404
|
-
///
|
|
405
|
-
/// #[derive(Debug)]
|
|
406
|
-
/// struct CustomAsyncVisitor;
|
|
407
|
-
///
|
|
408
|
-
/// #[async_trait]
|
|
409
|
-
/// impl AsyncHtmlVisitor for CustomAsyncVisitor {
|
|
410
|
-
/// async fn visit_code_block(
|
|
411
|
-
/// &mut self,
|
|
412
|
-
/// _ctx: &NodeContext,
|
|
413
|
-
/// language: Option<&str>,
|
|
414
|
-
/// code: &str,
|
|
415
|
-
/// ) -> VisitResult {
|
|
416
|
-
/// // Can perform async operations here (e.g., syntax highlighting via service)
|
|
417
|
-
/// VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
|
|
418
|
-
/// }
|
|
419
|
-
/// }
|
|
420
|
-
///
|
|
421
|
-
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
|
|
422
|
-
/// let visitor = Some(Rc::new(RefCell::new(CustomAsyncVisitor) as _));
|
|
423
|
-
/// let markdown = convert_with_async_visitor(html, None, visitor).await.unwrap();
|
|
424
|
-
/// ```
|
|
425
|
-
#[allow(clippy::future_not_send)]
|
|
426
|
-
/// # Errors
|
|
427
|
-
///
|
|
428
|
-
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
429
|
-
pub async fn convert_with_async_visitor(
|
|
430
|
-
html: &str,
|
|
431
|
-
options: Option<ConversionOptions>,
|
|
432
|
-
visitor: Option<visitor_helpers::AsyncVisitorHandle>,
|
|
433
|
-
) -> Result<String> {
|
|
434
|
-
let options = options.unwrap_or_default();
|
|
435
|
-
|
|
436
|
-
let normalized_html = normalize_input(html)?;
|
|
437
|
-
|
|
438
|
-
// Use the async implementation that properly awaits visitor callbacks
|
|
439
|
-
let markdown =
|
|
440
|
-
crate::converter::convert_html_with_visitor_async(normalized_html.as_ref(), &options, visitor).await?;
|
|
441
|
-
|
|
442
216
|
if options.wrap {
|
|
443
217
|
Ok(crate::wrapper::wrap_markdown(&markdown, &options))
|
|
444
218
|
} else {
|
|
@@ -681,371 +455,3 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
|
|
|
681
455
|
let update: crate::MetadataConfigUpdate = parse_json(json)?;
|
|
682
456
|
Ok(MetadataConfig::from(update))
|
|
683
457
|
}
|
|
684
|
-
|
|
685
|
-
// ============================================================================
|
|
686
|
-
// Table Extraction API (requires visitor feature)
|
|
687
|
-
// ============================================================================
|
|
688
|
-
|
|
689
|
-
/// Extracted table data from HTML conversion.
|
|
690
|
-
///
|
|
691
|
-
/// Each instance represents a single `<table>` element found during conversion.
|
|
692
|
-
/// Tables are collected in document order.
|
|
693
|
-
#[cfg(feature = "visitor")]
|
|
694
|
-
#[derive(Debug, Clone)]
|
|
695
|
-
#[cfg_attr(
|
|
696
|
-
any(feature = "serde", feature = "metadata"),
|
|
697
|
-
derive(serde::Serialize, serde::Deserialize)
|
|
698
|
-
)]
|
|
699
|
-
pub struct TableData {
|
|
700
|
-
/// Table cells organized as rows x columns. Cell contents are already
|
|
701
|
-
/// converted to the target output format (markdown/djot/plain).
|
|
702
|
-
pub cells: Vec<Vec<String>>,
|
|
703
|
-
/// Complete rendered table in the target output format.
|
|
704
|
-
pub markdown: String,
|
|
705
|
-
/// Per-row flag indicating whether the row was inside `<thead>`.
|
|
706
|
-
pub is_header_row: Vec<bool>,
|
|
707
|
-
}
|
|
708
|
-
|
|
709
|
-
/// Result of HTML-to-markdown conversion with extracted table data.
|
|
710
|
-
#[cfg(feature = "visitor")]
|
|
711
|
-
#[derive(Debug, Clone)]
|
|
712
|
-
#[cfg_attr(
|
|
713
|
-
any(feature = "serde", feature = "metadata"),
|
|
714
|
-
derive(serde::Serialize, serde::Deserialize)
|
|
715
|
-
)]
|
|
716
|
-
pub struct ConversionWithTables {
|
|
717
|
-
/// Converted markdown/djot/plain text content.
|
|
718
|
-
pub content: String,
|
|
719
|
-
/// Extended metadata (if metadata extraction was requested).
|
|
720
|
-
#[cfg(feature = "metadata")]
|
|
721
|
-
pub metadata: Option<ExtendedMetadata>,
|
|
722
|
-
/// All tables found in the HTML, in document order.
|
|
723
|
-
pub tables: Vec<TableData>,
|
|
724
|
-
}
|
|
725
|
-
|
|
726
|
-
#[cfg(feature = "visitor")]
|
|
727
|
-
#[derive(Debug)]
|
|
728
|
-
struct TableCollector {
|
|
729
|
-
tables: Vec<TableData>,
|
|
730
|
-
current_rows: Vec<Vec<String>>,
|
|
731
|
-
current_is_header: Vec<bool>,
|
|
732
|
-
}
|
|
733
|
-
|
|
734
|
-
#[cfg(feature = "visitor")]
|
|
735
|
-
impl TableCollector {
|
|
736
|
-
fn new() -> Self {
|
|
737
|
-
Self {
|
|
738
|
-
tables: Vec::new(),
|
|
739
|
-
current_rows: Vec::new(),
|
|
740
|
-
current_is_header: Vec::new(),
|
|
741
|
-
}
|
|
742
|
-
}
|
|
743
|
-
}
|
|
744
|
-
|
|
745
|
-
#[cfg(feature = "visitor")]
|
|
746
|
-
impl visitor::HtmlVisitor for TableCollector {
|
|
747
|
-
fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
|
|
748
|
-
self.current_rows.clear();
|
|
749
|
-
self.current_is_header.clear();
|
|
750
|
-
visitor::VisitResult::Continue
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
fn visit_table_row(
|
|
754
|
-
&mut self,
|
|
755
|
-
_ctx: &visitor::NodeContext,
|
|
756
|
-
cells: &[String],
|
|
757
|
-
is_header: bool,
|
|
758
|
-
) -> visitor::VisitResult {
|
|
759
|
-
self.current_rows.push(cells.to_vec());
|
|
760
|
-
self.current_is_header.push(is_header);
|
|
761
|
-
visitor::VisitResult::Continue
|
|
762
|
-
}
|
|
763
|
-
|
|
764
|
-
fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
|
|
765
|
-
if !self.current_rows.is_empty() {
|
|
766
|
-
self.tables.push(TableData {
|
|
767
|
-
cells: std::mem::take(&mut self.current_rows),
|
|
768
|
-
markdown: output.to_string(),
|
|
769
|
-
is_header_row: std::mem::take(&mut self.current_is_header),
|
|
770
|
-
});
|
|
771
|
-
}
|
|
772
|
-
visitor::VisitResult::Continue
|
|
773
|
-
}
|
|
774
|
-
}
|
|
775
|
-
|
|
776
|
-
/// Convert HTML to markdown/djot/plain text with structured table extraction.
|
|
777
|
-
///
|
|
778
|
-
/// Combines conversion, optional metadata extraction, and table data collection
|
|
779
|
-
/// in a single DOM walk. Each table found in the HTML is returned with its
|
|
780
|
-
/// cell contents (already converted to the target format) and rendered output.
|
|
781
|
-
///
|
|
782
|
-
/// # Arguments
|
|
783
|
-
///
|
|
784
|
-
/// * `html` - The HTML string to convert
|
|
785
|
-
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
|
|
786
|
-
/// * `metadata_cfg` - Optional metadata extraction configuration (requires `metadata` feature)
|
|
787
|
-
///
|
|
788
|
-
/// # Example
|
|
789
|
-
///
|
|
790
|
-
/// ```ignore
|
|
791
|
-
/// use html_to_markdown_rs::convert_with_tables;
|
|
792
|
-
///
|
|
793
|
-
/// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
|
|
794
|
-
/// let result = convert_with_tables(html, None, None).unwrap();
|
|
795
|
-
/// assert_eq!(result.tables.len(), 1);
|
|
796
|
-
/// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
|
|
797
|
-
/// ```
|
|
798
|
-
///
|
|
799
|
-
/// # Errors
|
|
800
|
-
///
|
|
801
|
-
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
802
|
-
#[cfg(feature = "visitor")]
|
|
803
|
-
pub fn convert_with_tables(
|
|
804
|
-
html: &str,
|
|
805
|
-
options: Option<ConversionOptions>,
|
|
806
|
-
#[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
|
|
807
|
-
#[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
|
|
808
|
-
) -> Result<ConversionWithTables> {
|
|
809
|
-
use std::cell::RefCell;
|
|
810
|
-
use std::rc::Rc;
|
|
811
|
-
|
|
812
|
-
let collector = Rc::new(RefCell::new(TableCollector::new()));
|
|
813
|
-
let visitor_handle: visitor::VisitorHandle = Rc::clone(&collector) as visitor::VisitorHandle;
|
|
814
|
-
|
|
815
|
-
#[cfg(feature = "metadata")]
|
|
816
|
-
let result = {
|
|
817
|
-
let metadata_config = metadata_cfg.unwrap_or_default();
|
|
818
|
-
let (content, metadata) = convert_with_metadata(html, options, metadata_config, Some(visitor_handle))?;
|
|
819
|
-
let tables = Rc::try_unwrap(collector)
|
|
820
|
-
.map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
|
|
821
|
-
.into_inner()
|
|
822
|
-
.tables;
|
|
823
|
-
ConversionWithTables {
|
|
824
|
-
content,
|
|
825
|
-
metadata: Some(metadata),
|
|
826
|
-
tables,
|
|
827
|
-
}
|
|
828
|
-
};
|
|
829
|
-
|
|
830
|
-
#[cfg(not(feature = "metadata"))]
|
|
831
|
-
let result = {
|
|
832
|
-
let content = convert_with_visitor(html, options, Some(visitor_handle))?;
|
|
833
|
-
let tables = Rc::try_unwrap(collector)
|
|
834
|
-
.map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
|
|
835
|
-
.into_inner()
|
|
836
|
-
.tables;
|
|
837
|
-
ConversionWithTables { content, tables }
|
|
838
|
-
};
|
|
839
|
-
|
|
840
|
-
Ok(result)
|
|
841
|
-
}
|
|
842
|
-
|
|
843
|
-
#[cfg(test)]
|
|
844
|
-
#[cfg(feature = "visitor")]
|
|
845
|
-
mod table_extraction_tests {
|
|
846
|
-
use super::*;
|
|
847
|
-
|
|
848
|
-
fn tables_from_html(html: &str) -> ConversionWithTables {
|
|
849
|
-
convert_with_tables(
|
|
850
|
-
html,
|
|
851
|
-
None,
|
|
852
|
-
#[cfg(feature = "metadata")]
|
|
853
|
-
None,
|
|
854
|
-
#[cfg(not(feature = "metadata"))]
|
|
855
|
-
None,
|
|
856
|
-
)
|
|
857
|
-
.unwrap()
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
#[test]
|
|
861
|
-
fn test_convert_with_tables_basic() {
|
|
862
|
-
let html = r"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
|
|
863
|
-
let result = tables_from_html(html);
|
|
864
|
-
assert_eq!(result.tables.len(), 1);
|
|
865
|
-
assert_eq!(result.tables[0].cells.len(), 2);
|
|
866
|
-
assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
|
|
867
|
-
assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
|
|
868
|
-
assert!(result.tables[0].is_header_row[0]);
|
|
869
|
-
assert!(!result.tables[0].is_header_row[1]);
|
|
870
|
-
assert!(result.tables[0].markdown.contains('|'));
|
|
871
|
-
}
|
|
872
|
-
|
|
873
|
-
#[test]
|
|
874
|
-
fn test_convert_with_tables_nested() {
|
|
875
|
-
let html = r"
|
|
876
|
-
<table>
|
|
877
|
-
<tr><th>Category</th><th>Details</th></tr>
|
|
878
|
-
<tr>
|
|
879
|
-
<td>Project Alpha</td>
|
|
880
|
-
<td>
|
|
881
|
-
<table>
|
|
882
|
-
<tr><th>Task</th><th>Status</th></tr>
|
|
883
|
-
<tr><td>001</td><td>Done</td></tr>
|
|
884
|
-
</table>
|
|
885
|
-
</td>
|
|
886
|
-
</tr>
|
|
887
|
-
</table>";
|
|
888
|
-
let result = tables_from_html(html);
|
|
889
|
-
assert!(
|
|
890
|
-
result.tables.len() >= 2,
|
|
891
|
-
"Expected at least 2 tables (outer + nested), got {}",
|
|
892
|
-
result.tables.len()
|
|
893
|
-
);
|
|
894
|
-
}
|
|
895
|
-
|
|
896
|
-
#[test]
|
|
897
|
-
fn test_convert_with_tables_no_tables() {
|
|
898
|
-
let html = "<p>No tables here</p>";
|
|
899
|
-
let result = tables_from_html(html);
|
|
900
|
-
assert!(result.tables.is_empty());
|
|
901
|
-
assert!(result.content.contains("No tables here"));
|
|
902
|
-
}
|
|
903
|
-
|
|
904
|
-
#[test]
|
|
905
|
-
fn test_convert_with_tables_empty_table() {
|
|
906
|
-
let result = tables_from_html("<table></table>");
|
|
907
|
-
assert!(result.tables.is_empty(), "Empty table should not produce TableData");
|
|
908
|
-
}
|
|
909
|
-
|
|
910
|
-
#[test]
|
|
911
|
-
fn test_convert_with_tables_headers_only() {
|
|
912
|
-
let html = r"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>";
|
|
913
|
-
let result = tables_from_html(html);
|
|
914
|
-
assert_eq!(result.tables.len(), 1);
|
|
915
|
-
assert!(result.tables[0].is_header_row[0]);
|
|
916
|
-
assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
|
|
917
|
-
}
|
|
918
|
-
|
|
919
|
-
#[test]
|
|
920
|
-
fn test_convert_with_tables_thead_tbody_tfoot() {
|
|
921
|
-
let html = r"
|
|
922
|
-
<table>
|
|
923
|
-
<thead><tr><th>H1</th></tr></thead>
|
|
924
|
-
<tbody><tr><td>B1</td></tr></tbody>
|
|
925
|
-
<tfoot><tr><td>F1</td></tr></tfoot>
|
|
926
|
-
</table>";
|
|
927
|
-
let result = tables_from_html(html);
|
|
928
|
-
assert_eq!(result.tables.len(), 1);
|
|
929
|
-
let t = &result.tables[0];
|
|
930
|
-
assert!(t.is_header_row[0], "thead row should be header");
|
|
931
|
-
assert!(!t.is_header_row[1], "tbody row should not be header");
|
|
932
|
-
assert_eq!(t.cells[0], vec!["H1"]);
|
|
933
|
-
assert_eq!(t.cells[1], vec!["B1"]);
|
|
934
|
-
}
|
|
935
|
-
|
|
936
|
-
#[test]
|
|
937
|
-
fn test_convert_with_tables_multiple_separate() {
|
|
938
|
-
let html = r"
|
|
939
|
-
<table><tr><td>T1</td></tr></table>
|
|
940
|
-
<p>Between tables</p>
|
|
941
|
-
<table><tr><td>T2</td></tr></table>";
|
|
942
|
-
let result = tables_from_html(html);
|
|
943
|
-
assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
|
|
944
|
-
}
|
|
945
|
-
|
|
946
|
-
#[test]
|
|
947
|
-
fn test_convert_with_tables_special_chars() {
|
|
948
|
-
let html = r"<table><tr><td>a | b</td><td>c*d</td></tr></table>";
|
|
949
|
-
let result = tables_from_html(html);
|
|
950
|
-
assert_eq!(result.tables.len(), 1);
|
|
951
|
-
assert!(!result.tables[0].cells[0].is_empty());
|
|
952
|
-
}
|
|
953
|
-
|
|
954
|
-
#[test]
|
|
955
|
-
fn test_convert_with_tables_single_cell() {
|
|
956
|
-
let html = r"<table><tr><td>Only cell</td></tr></table>";
|
|
957
|
-
let result = tables_from_html(html);
|
|
958
|
-
assert_eq!(result.tables.len(), 1);
|
|
959
|
-
assert_eq!(result.tables[0].cells.len(), 1);
|
|
960
|
-
assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
|
|
961
|
-
}
|
|
962
|
-
|
|
963
|
-
#[test]
|
|
964
|
-
fn test_convert_with_tables_content_preserved() {
|
|
965
|
-
let html = r"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>";
|
|
966
|
-
let result = tables_from_html(html);
|
|
967
|
-
assert!(result.content.contains("Before"));
|
|
968
|
-
assert!(result.content.contains("After"));
|
|
969
|
-
assert!(result.content.contains('|'), "Markdown table should appear in content");
|
|
970
|
-
}
|
|
971
|
-
|
|
972
|
-
#[test]
|
|
973
|
-
fn test_convert_with_tables_with_options() {
|
|
974
|
-
let options = ConversionOptions {
|
|
975
|
-
heading_style: crate::options::HeadingStyle::Underlined,
|
|
976
|
-
..ConversionOptions::default()
|
|
977
|
-
};
|
|
978
|
-
let html = r"<h1>Title</h1><table><tr><td>Cell</td></tr></table>";
|
|
979
|
-
let result = convert_with_tables(
|
|
980
|
-
html,
|
|
981
|
-
Some(options),
|
|
982
|
-
#[cfg(feature = "metadata")]
|
|
983
|
-
None,
|
|
984
|
-
#[cfg(not(feature = "metadata"))]
|
|
985
|
-
None,
|
|
986
|
-
)
|
|
987
|
-
.unwrap();
|
|
988
|
-
assert_eq!(result.tables.len(), 1);
|
|
989
|
-
assert!(result.content.contains("Title"));
|
|
990
|
-
}
|
|
991
|
-
|
|
992
|
-
#[test]
|
|
993
|
-
fn test_convert_with_tables_plain_text_format() {
|
|
994
|
-
let options = ConversionOptions {
|
|
995
|
-
output_format: crate::options::OutputFormat::Plain,
|
|
996
|
-
..ConversionOptions::default()
|
|
997
|
-
};
|
|
998
|
-
let html = r"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>";
|
|
999
|
-
let result = convert_with_tables(
|
|
1000
|
-
html,
|
|
1001
|
-
Some(options),
|
|
1002
|
-
#[cfg(feature = "metadata")]
|
|
1003
|
-
None,
|
|
1004
|
-
#[cfg(not(feature = "metadata"))]
|
|
1005
|
-
None,
|
|
1006
|
-
)
|
|
1007
|
-
.unwrap();
|
|
1008
|
-
assert!(
|
|
1009
|
-
!result.tables.is_empty(),
|
|
1010
|
-
"Tables should be populated even with plain text output format"
|
|
1011
|
-
);
|
|
1012
|
-
assert_eq!(result.tables[0].cells[0], vec!["Name"]);
|
|
1013
|
-
}
|
|
1014
|
-
|
|
1015
|
-
#[cfg(feature = "metadata")]
|
|
1016
|
-
#[test]
|
|
1017
|
-
fn test_convert_with_tables_metadata_integration() {
|
|
1018
|
-
let html = r#"<html lang="en"><head><title>Test</title></head><body>
|
|
1019
|
-
<table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
|
|
1020
|
-
</body></html>"#;
|
|
1021
|
-
let config = MetadataConfig::default();
|
|
1022
|
-
let result = convert_with_tables(html, None, Some(config)).unwrap();
|
|
1023
|
-
assert_eq!(result.tables.len(), 1);
|
|
1024
|
-
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1025
|
-
assert_eq!(meta.document.language, Some("en".to_string()));
|
|
1026
|
-
}
|
|
1027
|
-
|
|
1028
|
-
#[cfg(feature = "metadata")]
|
|
1029
|
-
#[test]
|
|
1030
|
-
fn test_convert_with_tables_plain_text_metadata() {
|
|
1031
|
-
let options = ConversionOptions {
|
|
1032
|
-
output_format: crate::options::OutputFormat::Plain,
|
|
1033
|
-
..ConversionOptions::default()
|
|
1034
|
-
};
|
|
1035
|
-
let html = r#"<html lang="fr"><body>
|
|
1036
|
-
<table><tr><td>Cell</td></tr></table>
|
|
1037
|
-
</body></html>"#;
|
|
1038
|
-
let config = MetadataConfig::default();
|
|
1039
|
-
let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
|
|
1040
|
-
assert!(
|
|
1041
|
-
!result.tables.is_empty(),
|
|
1042
|
-
"Tables should be populated in plain text mode"
|
|
1043
|
-
);
|
|
1044
|
-
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1045
|
-
assert_eq!(
|
|
1046
|
-
meta.document.language,
|
|
1047
|
-
Some("fr".to_string()),
|
|
1048
|
-
"Metadata should be populated in plain text mode"
|
|
1049
|
-
);
|
|
1050
|
-
}
|
|
1051
|
-
}
|