html-to-markdown 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +5 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -33,7 +33,6 @@
|
|
|
33
33
|
clippy::assigning_clones,
|
|
34
34
|
clippy::uninlined_format_args
|
|
35
35
|
)]
|
|
36
|
-
#![allow(dead_code)]
|
|
37
36
|
|
|
38
37
|
//! High-performance HTML to Markdown converter.
|
|
39
38
|
//!
|
|
@@ -50,7 +49,6 @@
|
|
|
50
49
|
|
|
51
50
|
pub mod converter;
|
|
52
51
|
pub mod error;
|
|
53
|
-
pub mod hocr;
|
|
54
52
|
#[cfg(feature = "inline-images")]
|
|
55
53
|
mod inline_images;
|
|
56
54
|
#[cfg(feature = "metadata")]
|
|
@@ -58,6 +56,7 @@ pub mod metadata;
|
|
|
58
56
|
pub mod options;
|
|
59
57
|
pub mod safety;
|
|
60
58
|
pub mod text;
|
|
59
|
+
pub mod types;
|
|
61
60
|
#[cfg(feature = "visitor")]
|
|
62
61
|
pub mod visitor;
|
|
63
62
|
#[cfg(feature = "visitor")]
|
|
@@ -76,6 +75,11 @@ mod validation;
|
|
|
76
75
|
// ============================================================================
|
|
77
76
|
|
|
78
77
|
pub use exports::*;
|
|
78
|
+
pub use types::{
|
|
79
|
+
AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
|
|
80
|
+
TableGrid, TextAnnotation, WarningKind,
|
|
81
|
+
};
|
|
82
|
+
// Note: types::TableData will replace convert_api::TableData when convert() is refactored
|
|
79
83
|
|
|
80
84
|
// ============================================================================
|
|
81
85
|
// Main Public API Functions
|
|
@@ -90,141 +94,15 @@ pub use convert_api::{conversion_options_from_json, conversion_options_update_fr
|
|
|
90
94
|
pub use convert_api::metadata_config_from_json;
|
|
91
95
|
|
|
92
96
|
#[cfg(feature = "inline-images")]
|
|
93
|
-
pub use convert_api::
|
|
94
|
-
|
|
95
|
-
#[cfg(feature = "metadata")]
|
|
96
|
-
pub use convert_api::convert_with_metadata;
|
|
97
|
+
pub use convert_api::inline_image_config_from_json;
|
|
97
98
|
|
|
98
99
|
#[cfg(feature = "visitor")]
|
|
100
|
+
#[doc(hidden)]
|
|
99
101
|
pub use convert_api::convert_with_visitor;
|
|
100
102
|
|
|
101
|
-
#[cfg(feature = "visitor")]
|
|
102
|
-
pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
|
|
103
|
-
|
|
104
|
-
#[cfg(feature = "async-visitor")]
|
|
105
|
-
pub use convert_api::convert_with_async_visitor;
|
|
106
|
-
|
|
107
103
|
// Tests
|
|
108
104
|
// ============================================================================
|
|
109
105
|
|
|
110
|
-
#[cfg(all(test, feature = "metadata"))]
|
|
111
|
-
mod tests {
|
|
112
|
-
use super::*;
|
|
113
|
-
|
|
114
|
-
#[test]
|
|
115
|
-
fn test_convert_with_metadata_full_workflow() {
|
|
116
|
-
let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";
|
|
117
|
-
|
|
118
|
-
let config = MetadataConfig {
|
|
119
|
-
extract_document: true,
|
|
120
|
-
extract_headers: true,
|
|
121
|
-
extract_links: true,
|
|
122
|
-
extract_images: true,
|
|
123
|
-
extract_structured_data: true,
|
|
124
|
-
max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
|
|
125
|
-
};
|
|
126
|
-
|
|
127
|
-
let (markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
|
|
128
|
-
|
|
129
|
-
assert!(!markdown.is_empty());
|
|
130
|
-
assert!(markdown.contains("Main Title"));
|
|
131
|
-
assert!(markdown.contains("Subsection"));
|
|
132
|
-
|
|
133
|
-
assert_eq!(metadata.document.language, Some("en".to_string()));
|
|
134
|
-
|
|
135
|
-
assert_eq!(metadata.headers.len(), 2);
|
|
136
|
-
assert_eq!(metadata.headers[0].level, 1);
|
|
137
|
-
assert_eq!(metadata.headers[0].text, "Main Title");
|
|
138
|
-
assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
|
|
139
|
-
assert_eq!(metadata.headers[1].level, 2);
|
|
140
|
-
assert_eq!(metadata.headers[1].text, "Subsection");
|
|
141
|
-
|
|
142
|
-
assert!(metadata.links.len() >= 2);
|
|
143
|
-
let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
|
|
144
|
-
assert!(external_link.is_some());
|
|
145
|
-
let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
|
|
146
|
-
assert!(anchor_link.is_some());
|
|
147
|
-
|
|
148
|
-
assert_eq!(metadata.images.len(), 1);
|
|
149
|
-
assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
|
|
150
|
-
assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
|
|
151
|
-
assert_eq!(metadata.images[0].image_type, ImageType::External);
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
#[test]
|
|
155
|
-
fn test_convert_with_metadata_document_fields() {
|
|
156
|
-
let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";
|
|
157
|
-
|
|
158
|
-
let (_markdown, metadata) =
|
|
159
|
-
convert_with_metadata(html, None, MetadataConfig::default(), None).expect("conversion should succeed");
|
|
160
|
-
|
|
161
|
-
assert_eq!(
|
|
162
|
-
metadata.document.title,
|
|
163
|
-
Some("Test Article".to_string()),
|
|
164
|
-
"document: {:?}",
|
|
165
|
-
metadata.document
|
|
166
|
-
);
|
|
167
|
-
assert_eq!(metadata.document.description, Some("Desc".to_string()));
|
|
168
|
-
assert_eq!(metadata.document.author, Some("Author".to_string()));
|
|
169
|
-
assert_eq!(metadata.document.language, Some("en".to_string()));
|
|
170
|
-
assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
|
|
171
|
-
assert_eq!(
|
|
172
|
-
metadata.document.open_graph.get("description"),
|
|
173
|
-
Some(&"OG Desc".to_string())
|
|
174
|
-
);
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
#[test]
|
|
178
|
-
fn test_convert_with_metadata_empty_config() {
|
|
179
|
-
let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";
|
|
180
|
-
|
|
181
|
-
let config = MetadataConfig {
|
|
182
|
-
extract_document: false,
|
|
183
|
-
extract_headers: false,
|
|
184
|
-
extract_links: false,
|
|
185
|
-
extract_images: false,
|
|
186
|
-
extract_structured_data: false,
|
|
187
|
-
max_structured_data_size: 0,
|
|
188
|
-
};
|
|
189
|
-
|
|
190
|
-
let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
|
|
191
|
-
|
|
192
|
-
assert!(metadata.headers.is_empty());
|
|
193
|
-
assert!(metadata.links.is_empty());
|
|
194
|
-
assert!(metadata.images.is_empty());
|
|
195
|
-
assert_eq!(metadata.document.language, None);
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
#[test]
|
|
199
|
-
fn test_convert_with_metadata_data_uri_image() {
|
|
200
|
-
let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";
|
|
201
|
-
|
|
202
|
-
let config = MetadataConfig::default();
|
|
203
|
-
|
|
204
|
-
let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
|
|
205
|
-
|
|
206
|
-
assert_eq!(metadata.images.len(), 1);
|
|
207
|
-
assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
|
|
208
|
-
assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
#[test]
|
|
212
|
-
fn test_convert_with_metadata_relative_paths() {
|
|
213
|
-
let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;
|
|
214
|
-
|
|
215
|
-
let config = MetadataConfig::default();
|
|
216
|
-
|
|
217
|
-
let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
|
|
218
|
-
|
|
219
|
-
let internal_link_count = metadata
|
|
220
|
-
.links
|
|
221
|
-
.iter()
|
|
222
|
-
.filter(|l| l.link_type == LinkType::Internal)
|
|
223
|
-
.count();
|
|
224
|
-
assert_eq!(internal_link_count, 2);
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
|
|
228
106
|
#[cfg(test)]
|
|
229
107
|
mod basic_tests {
|
|
230
108
|
use super::*;
|
|
@@ -253,7 +131,8 @@ mod basic_tests {
|
|
|
253
131
|
#[test]
|
|
254
132
|
fn test_plain_text_allowed() {
|
|
255
133
|
let result = convert("Just text", None).unwrap();
|
|
256
|
-
|
|
134
|
+
let content = result.content.unwrap_or_default();
|
|
135
|
+
assert!(content.contains("Just text"));
|
|
257
136
|
}
|
|
258
137
|
|
|
259
138
|
#[test]
|
|
@@ -264,7 +143,8 @@ mod basic_tests {
|
|
|
264
143
|
..ConversionOptions::default()
|
|
265
144
|
};
|
|
266
145
|
let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
|
|
267
|
-
|
|
268
|
-
assert!(
|
|
146
|
+
let content = result.content.unwrap_or_default();
|
|
147
|
+
assert!(content.contains(r"\*asterisks\*"));
|
|
148
|
+
assert!(content.contains(r"\_underscores\_"));
|
|
269
149
|
}
|
|
270
150
|
}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
use super::config::MetadataConfig;
|
|
4
4
|
use super::extraction::{extract_document_metadata, extract_structured_data};
|
|
5
|
-
use super::types::{
|
|
5
|
+
use super::types::{HtmlMetadata, ImageMetadata, ImageType, LinkMetadata};
|
|
6
6
|
use std::collections::BTreeMap;
|
|
7
7
|
|
|
8
8
|
/// Internal metadata collector for single-pass extraction.
|
|
@@ -256,13 +256,13 @@ impl MetadataCollector {
|
|
|
256
256
|
/// Finish collection and return all extracted metadata.
|
|
257
257
|
///
|
|
258
258
|
/// Performs final processing, validation, and consolidation of all
|
|
259
|
-
/// collected data into the [`
|
|
259
|
+
/// collected data into the [`HtmlMetadata`] output structure.
|
|
260
260
|
#[allow(dead_code)]
|
|
261
|
-
pub(crate) fn finish(self) ->
|
|
261
|
+
pub(crate) fn finish(self) -> HtmlMetadata {
|
|
262
262
|
let structured_data = extract_structured_data(self.json_ld);
|
|
263
263
|
let document = extract_document_metadata(self.head_metadata, self.lang, self.dir);
|
|
264
264
|
|
|
265
|
-
|
|
265
|
+
HtmlMetadata {
|
|
266
266
|
document,
|
|
267
267
|
headers: self.headers,
|
|
268
268
|
links: self.links,
|
|
@@ -40,14 +40,14 @@
|
|
|
40
40
|
//! - [`ImageMetadata`]: Image element with src, alt, title, dimensions, type, and attributes
|
|
41
41
|
//! - [`StructuredData`]: Structured data block with type and raw JSON
|
|
42
42
|
//! - [`MetadataConfig`]: Configuration controlling extraction granularity and size limits
|
|
43
|
-
//! - [`
|
|
43
|
+
//! - [`HtmlMetadata`]: Top-level result containing all extracted metadata
|
|
44
44
|
//!
|
|
45
45
|
//! # Examples
|
|
46
46
|
//!
|
|
47
|
-
//! ## Basic Usage with `
|
|
47
|
+
//! ## Basic Usage with `convert()`
|
|
48
48
|
//!
|
|
49
49
|
//! ```ignore
|
|
50
|
-
//! use html_to_markdown_rs::
|
|
50
|
+
//! use html_to_markdown_rs::convert;
|
|
51
51
|
//!
|
|
52
52
|
//! let html = r#"
|
|
53
53
|
//! <html lang="en">
|
|
@@ -63,8 +63,8 @@
|
|
|
63
63
|
//! </html>
|
|
64
64
|
//! "#;
|
|
65
65
|
//!
|
|
66
|
-
//! let
|
|
67
|
-
//! let
|
|
66
|
+
//! let result = convert(html, None)?;
|
|
67
|
+
//! let metadata = result.metadata.unwrap();
|
|
68
68
|
//!
|
|
69
69
|
//! // Access document metadata
|
|
70
70
|
//! assert_eq!(metadata.document.title, Some("My Article".to_string()));
|
|
@@ -88,28 +88,26 @@
|
|
|
88
88
|
//! ## Selective Extraction
|
|
89
89
|
//!
|
|
90
90
|
//! ```ignore
|
|
91
|
-
//! use html_to_markdown_rs::{
|
|
91
|
+
//! use html_to_markdown_rs::{convert, ConversionOptions};
|
|
92
92
|
//!
|
|
93
|
-
//! let
|
|
94
|
-
//!
|
|
95
|
-
//!
|
|
96
|
-
//! extract_images: false, // Skip images
|
|
97
|
-
//! extract_structured_data: false, // Skip structured data
|
|
98
|
-
//! max_structured_data_size: 0,
|
|
93
|
+
//! let options = ConversionOptions {
|
|
94
|
+
//! extract_metadata: false, // Disable metadata extraction
|
|
95
|
+
//! ..Default::default()
|
|
99
96
|
//! };
|
|
100
97
|
//!
|
|
101
|
-
//! let
|
|
102
|
-
//!
|
|
98
|
+
//! let result = convert(html, Some(options))?;
|
|
99
|
+
//! assert!(result.metadata.is_none()); // Metadata not extracted
|
|
103
100
|
//! # Ok::<(), html_to_markdown_rs::ConversionError>(())
|
|
104
101
|
//! ```
|
|
105
102
|
//!
|
|
106
103
|
//! ## Analyzing Link Types
|
|
107
104
|
//!
|
|
108
105
|
//! ```ignore
|
|
109
|
-
//! use html_to_markdown_rs::
|
|
106
|
+
//! use html_to_markdown_rs::convert;
|
|
110
107
|
//! use html_to_markdown_rs::metadata::LinkType;
|
|
111
108
|
//!
|
|
112
|
-
//! let
|
|
109
|
+
//! let result = convert(html, None)?;
|
|
110
|
+
//! let metadata = result.metadata.unwrap();
|
|
113
111
|
//!
|
|
114
112
|
//! for link in &metadata.links {
|
|
115
113
|
//! match link.link_type {
|
|
@@ -129,11 +127,13 @@
|
|
|
129
127
|
//! This enables easy export to JSON, YAML, or other formats:
|
|
130
128
|
//!
|
|
131
129
|
//! ```ignore
|
|
132
|
-
//! use html_to_markdown_rs::
|
|
130
|
+
//! use html_to_markdown_rs::convert;
|
|
133
131
|
//!
|
|
134
|
-
//! let
|
|
135
|
-
//! let
|
|
136
|
-
//!
|
|
132
|
+
//! let result = convert(html, None)?;
|
|
133
|
+
//! if let Some(metadata) = &result.metadata {
|
|
134
|
+
//! let json = serde_json::to_string_pretty(metadata)?;
|
|
135
|
+
//! println!("{}", json);
|
|
136
|
+
//! }
|
|
137
137
|
//! # Ok::<(), Box<dyn std::error::Error>>(())
|
|
138
138
|
//! ```
|
|
139
139
|
|
|
@@ -146,8 +146,8 @@ pub mod types;
|
|
|
146
146
|
pub use collector::MetadataCollector;
|
|
147
147
|
pub use config::{DEFAULT_MAX_STRUCTURED_DATA_SIZE, MetadataConfig, MetadataConfigUpdate};
|
|
148
148
|
pub use types::{
|
|
149
|
-
DocumentMetadata,
|
|
150
|
-
|
|
149
|
+
DocumentMetadata, HeaderMetadata, HtmlMetadata, ImageMetadata, ImageType, LinkMetadata, LinkType, StructuredData,
|
|
150
|
+
StructuredDataType, TextDirection,
|
|
151
151
|
};
|
|
152
152
|
|
|
153
153
|
// Internal handle type for shared mutable access during tree traversal
|
|
@@ -446,8 +446,8 @@ pub struct StructuredData {
|
|
|
446
446
|
/// # Examples
|
|
447
447
|
///
|
|
448
448
|
/// ```
|
|
449
|
-
/// # use html_to_markdown_rs::metadata::
|
|
450
|
-
/// let metadata =
|
|
449
|
+
/// # use html_to_markdown_rs::metadata::HtmlMetadata;
|
|
450
|
+
/// let metadata = HtmlMetadata {
|
|
451
451
|
/// document: Default::default(),
|
|
452
452
|
/// headers: Vec::new(),
|
|
453
453
|
/// links: Vec::new(),
|
|
@@ -459,7 +459,7 @@ pub struct StructuredData {
|
|
|
459
459
|
/// ```
|
|
460
460
|
#[derive(Debug, Clone, Default)]
|
|
461
461
|
#[cfg_attr(feature = "metadata", derive(serde::Serialize, serde::Deserialize))]
|
|
462
|
-
pub struct
|
|
462
|
+
pub struct HtmlMetadata {
|
|
463
463
|
/// Document-level metadata (title, description, canonical, etc.)
|
|
464
464
|
pub document: DocumentMetadata,
|
|
465
465
|
|