html-to-markdown 3.0.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +42 -12
- data/Gemfile +1 -0
- data/Gemfile.lock +27 -55
- data/README.md +9 -10
- data/Rakefile +4 -10
- data/ext/html-to-markdown_rb/Cargo.toml +14 -0
- data/ext/html_to_markdown_rb/Cargo.toml +16 -0
- data/ext/html_to_markdown_rb/extconf.rb +10 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
- data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
- data/html-to-markdown-rb.gemspec +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +31 -21
- data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
- data/sig/html_to_markdown.rbs +17 -5
- data/vendor/Cargo.toml +4 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +56 -17
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +3 -2
- data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
- data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
- data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +14 -13
- data/vendor/html-to-markdown-rs/src/options/mod.rs +2 -2
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
- data/vendor/html-to-markdown-rs/src/options/validation.rs +46 -4
- data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
- data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +27 -3
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
- metadata +13 -18
- data/ext/html-to-markdown-rb/extconf.rb +0 -41
- data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
- data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
- data/ext/html-to-markdown-rb/native/README.md +0 -215
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
- data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
- data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
- data/lib/html_to_markdown/cli.rb +0 -21
- data/lib/html_to_markdown/cli_proxy.rb +0 -74
- data/spec/cli_proxy_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -10
|
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
|
|
|
78
78
|
};
|
|
79
79
|
|
|
80
80
|
if should_output_media_link(&src) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
82
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
83
|
+
output.push('[');
|
|
84
|
+
output.push_str(&src);
|
|
85
|
+
output.push_str("][");
|
|
86
|
+
output.push_str(&ref_num.to_string());
|
|
87
|
+
output.push(']');
|
|
88
|
+
} else {
|
|
89
|
+
output.push('[');
|
|
90
|
+
output.push_str(&src);
|
|
91
|
+
output.push_str("](");
|
|
92
|
+
output.push_str(&src);
|
|
93
|
+
output.push(')');
|
|
94
|
+
}
|
|
86
95
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
87
96
|
output.push_str("\n\n");
|
|
88
97
|
}
|
|
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
|
|
|
132
141
|
};
|
|
133
142
|
|
|
134
143
|
if should_output_media_link(&src) {
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
144
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
145
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
146
|
+
output.push('[');
|
|
147
|
+
output.push_str(&src);
|
|
148
|
+
output.push_str("][");
|
|
149
|
+
output.push_str(&ref_num.to_string());
|
|
150
|
+
output.push(']');
|
|
151
|
+
} else {
|
|
152
|
+
output.push('[');
|
|
153
|
+
output.push_str(&src);
|
|
154
|
+
output.push_str("](");
|
|
155
|
+
output.push_str(&src);
|
|
156
|
+
output.push(')');
|
|
157
|
+
}
|
|
140
158
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
141
159
|
output.push_str("\n\n");
|
|
142
160
|
}
|
|
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
|
|
|
199
217
|
.map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
|
|
200
218
|
|
|
201
219
|
if !src.is_empty() {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
220
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
221
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
222
|
+
output.push('[');
|
|
223
|
+
output.push_str(&src);
|
|
224
|
+
output.push_str("][");
|
|
225
|
+
output.push_str(&ref_num.to_string());
|
|
226
|
+
output.push(']');
|
|
227
|
+
} else {
|
|
228
|
+
output.push('[');
|
|
229
|
+
output.push_str(&src);
|
|
230
|
+
output.push_str("](");
|
|
231
|
+
output.push_str(&src);
|
|
232
|
+
output.push(')');
|
|
233
|
+
}
|
|
207
234
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
208
235
|
output.push_str("\n\n");
|
|
209
236
|
}
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
//!
|
|
41
41
|
//! Each submodule (block, inline, list, etc.) follows a consistent pattern:
|
|
42
42
|
//!
|
|
43
|
-
//! ```
|
|
43
|
+
//! ```text
|
|
44
44
|
//! // Module declares handlers for specific element types
|
|
45
45
|
//! pub fn dispatch_<category>_handler(
|
|
46
46
|
//! tag_name: &str,
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
//! Once `converter.rs` is refactored to use `converter/main.rs`, the walk_node function
|
|
75
75
|
//! will use dispatch functions like:
|
|
76
76
|
//!
|
|
77
|
-
//! ```
|
|
77
|
+
//! ```text
|
|
78
78
|
//! use crate::converter::{block, inline, list, media, semantic, form};
|
|
79
79
|
//!
|
|
80
80
|
//! fn walk_node(...) {
|
|
@@ -103,6 +103,7 @@ pub mod media;
|
|
|
103
103
|
mod metadata;
|
|
104
104
|
pub mod plain_text;
|
|
105
105
|
pub mod preprocessing_helpers;
|
|
106
|
+
pub mod reference_collector;
|
|
106
107
|
pub mod semantic;
|
|
107
108
|
pub mod text;
|
|
108
109
|
mod text_node;
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
//! Collector for reference-style link definitions.
|
|
2
|
+
|
|
3
|
+
use std::cell::RefCell;
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
use std::rc::Rc;
|
|
6
|
+
|
|
7
|
+
/// Shared handle for passing the collector through the conversion context.
|
|
8
|
+
pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
|
|
9
|
+
|
|
10
|
+
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
|
11
|
+
struct ReferenceKey {
|
|
12
|
+
url: String,
|
|
13
|
+
title: Option<String>,
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/// Collects link/image references during conversion and produces a reference
|
|
17
|
+
/// definitions section at the end of the document.
|
|
18
|
+
#[derive(Debug, Default)]
|
|
19
|
+
pub struct ReferenceCollector {
|
|
20
|
+
map: HashMap<ReferenceKey, usize>,
|
|
21
|
+
entries: Vec<(usize, String, Option<String>)>,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
impl ReferenceCollector {
|
|
25
|
+
/// Create a new, empty reference collector.
|
|
26
|
+
pub fn new() -> Self {
|
|
27
|
+
Self::default()
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Register a URL (and optional title) and return its 1-based reference number.
|
|
31
|
+
///
|
|
32
|
+
/// If the same URL+title pair was already registered, the existing number is returned.
|
|
33
|
+
pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
|
|
34
|
+
let key = ReferenceKey {
|
|
35
|
+
url: url.to_string(),
|
|
36
|
+
title: title.map(String::from),
|
|
37
|
+
};
|
|
38
|
+
if let Some(&num) = self.map.get(&key) {
|
|
39
|
+
return num;
|
|
40
|
+
}
|
|
41
|
+
let num = self.entries.len() + 1;
|
|
42
|
+
self.map.insert(key, num);
|
|
43
|
+
self.entries.push((num, url.to_string(), title.map(String::from)));
|
|
44
|
+
num
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Produce the reference definitions section.
|
|
48
|
+
///
|
|
49
|
+
/// Returns an empty string when no references were collected.
|
|
50
|
+
pub fn finish(&self) -> String {
|
|
51
|
+
if self.entries.is_empty() {
|
|
52
|
+
return String::new();
|
|
53
|
+
}
|
|
54
|
+
let mut out = String::new();
|
|
55
|
+
for (num, url, title) in &self.entries {
|
|
56
|
+
out.push('[');
|
|
57
|
+
out.push_str(&num.to_string());
|
|
58
|
+
out.push_str("]: ");
|
|
59
|
+
out.push_str(url);
|
|
60
|
+
if let Some(t) = title {
|
|
61
|
+
out.push_str(" \"");
|
|
62
|
+
out.push_str(&t.replace('"', "\\\""));
|
|
63
|
+
out.push('"');
|
|
64
|
+
}
|
|
65
|
+
out.push('\n');
|
|
66
|
+
}
|
|
67
|
+
out
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -166,7 +166,7 @@ pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
|
|
|
166
166
|
/// If `index` is already a char boundary it is returned unchanged.
|
|
167
167
|
/// Otherwise it walks backwards to find one. Returns 0 if no boundary
|
|
168
168
|
/// is found before `index`.
|
|
169
|
-
pub
|
|
169
|
+
pub fn floor_char_boundary(s: &str, index: usize) -> usize {
|
|
170
170
|
if index >= s.len() {
|
|
171
171
|
s.len()
|
|
172
172
|
} else {
|
|
@@ -18,6 +18,7 @@ pub use crate::metadata::{
|
|
|
18
18
|
};
|
|
19
19
|
|
|
20
20
|
pub use crate::options::{
|
|
21
|
-
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
-
NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
|
|
21
|
+
CodeBlockStyle, ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
+
LinkStyle, ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
|
|
23
|
+
PreprocessingPreset, WhitespaceMode,
|
|
23
24
|
};
|
|
@@ -26,7 +26,7 @@ pub const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
|
|
|
26
26
|
/// corresponding fields unchanged when applied via [`InlineImageConfig::apply_update`].
|
|
27
27
|
#[derive(Debug, Clone, Default)]
|
|
28
28
|
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
|
|
29
|
-
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(
|
|
29
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
30
30
|
pub struct InlineImageConfigUpdate {
|
|
31
31
|
/// Optional maximum decoded size override in bytes.
|
|
32
32
|
pub max_decoded_size_bytes: Option<u64>,
|
|
@@ -77,9 +77,8 @@ mod validation;
|
|
|
77
77
|
pub use exports::*;
|
|
78
78
|
pub use types::{
|
|
79
79
|
AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
|
|
80
|
-
TableGrid, TextAnnotation, WarningKind,
|
|
80
|
+
TableData, TableGrid, TextAnnotation, WarningKind,
|
|
81
81
|
};
|
|
82
|
-
// Note: types::TableData will replace convert_api::TableData when convert() is refactored
|
|
83
82
|
|
|
84
83
|
// ============================================================================
|
|
85
84
|
// Main Public API Functions
|
|
@@ -133,7 +133,7 @@ pub struct MetadataConfig {
|
|
|
133
133
|
/// ```
|
|
134
134
|
#[derive(Debug, Clone, Default)]
|
|
135
135
|
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
|
|
136
|
-
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(
|
|
136
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
137
137
|
pub struct MetadataConfigUpdate {
|
|
138
138
|
/// Optional override for extracting document-level metadata.
|
|
139
139
|
///
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
//!
|
|
47
47
|
//! ## Basic Usage with `convert()`
|
|
48
48
|
//!
|
|
49
|
-
//! ```
|
|
49
|
+
//! ```text
|
|
50
50
|
//! use html_to_markdown_rs::convert;
|
|
51
51
|
//!
|
|
52
52
|
//! let html = r#"
|
|
@@ -87,7 +87,7 @@
|
|
|
87
87
|
//!
|
|
88
88
|
//! ## Selective Extraction
|
|
89
89
|
//!
|
|
90
|
-
//! ```
|
|
90
|
+
//! ```text
|
|
91
91
|
//! use html_to_markdown_rs::{convert, ConversionOptions};
|
|
92
92
|
//!
|
|
93
93
|
//! let options = ConversionOptions {
|
|
@@ -102,7 +102,7 @@
|
|
|
102
102
|
//!
|
|
103
103
|
//! ## Analyzing Link Types
|
|
104
104
|
//!
|
|
105
|
-
//! ```
|
|
105
|
+
//! ```text
|
|
106
106
|
//! use html_to_markdown_rs::convert;
|
|
107
107
|
//! use html_to_markdown_rs::metadata::LinkType;
|
|
108
108
|
//!
|
|
@@ -126,7 +126,7 @@
|
|
|
126
126
|
//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
|
|
127
127
|
//! This enables easy export to JSON, YAML, or other formats:
|
|
128
128
|
//!
|
|
129
|
-
//! ```
|
|
129
|
+
//! ```text
|
|
130
130
|
//! use html_to_markdown_rs::convert;
|
|
131
131
|
//!
|
|
132
132
|
//! let result = convert(html, None)?;
|
|
@@ -160,7 +160,7 @@ use std::rc::Rc;
|
|
|
160
160
|
///
|
|
161
161
|
/// # Examples
|
|
162
162
|
///
|
|
163
|
-
/// ```
|
|
163
|
+
/// ```text
|
|
164
164
|
/// let collector = MetadataCollector::new(MetadataConfig::default());
|
|
165
165
|
/// let handle = Rc::new(RefCell::new(collector));
|
|
166
166
|
///
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::options::preprocessing::PreprocessingOptions;
|
|
6
6
|
use crate::options::validation::{
|
|
7
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
7
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
8
8
|
};
|
|
9
9
|
|
|
10
10
|
/// Main conversion options for HTML to Markdown conversion.
|
|
@@ -13,7 +13,7 @@ use crate::options::validation::{
|
|
|
13
13
|
///
|
|
14
14
|
/// # Example
|
|
15
15
|
///
|
|
16
|
-
/// ```
|
|
16
|
+
/// ```text
|
|
17
17
|
/// use html_to_markdown_rs::ConversionOptions;
|
|
18
18
|
///
|
|
19
19
|
/// let options = ConversionOptions::builder()
|
|
@@ -27,10 +27,7 @@ use crate::options::validation::{
|
|
|
27
27
|
any(feature = "serde", feature = "metadata"),
|
|
28
28
|
derive(serde::Serialize, serde::Deserialize)
|
|
29
29
|
)]
|
|
30
|
-
#[cfg_attr(
|
|
31
|
-
any(feature = "serde", feature = "metadata"),
|
|
32
|
-
serde(rename_all = "camelCase", default, deny_unknown_fields)
|
|
33
|
-
)]
|
|
30
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
|
|
34
31
|
pub struct ConversionOptions {
|
|
35
32
|
/// Heading style to use in Markdown output (ATX `#` or Setext underline).
|
|
36
33
|
pub heading_style: HeadingStyle,
|
|
@@ -94,6 +91,8 @@ pub struct ConversionOptions {
|
|
|
94
91
|
pub preserve_tags: Vec<String>,
|
|
95
92
|
/// Skip conversion of `<img>` elements (omit images from output).
|
|
96
93
|
pub skip_images: bool,
|
|
94
|
+
/// Link rendering style (inline or reference).
|
|
95
|
+
pub link_style: LinkStyle,
|
|
97
96
|
/// Target output format (Markdown, plain text, etc.).
|
|
98
97
|
pub output_format: OutputFormat,
|
|
99
98
|
/// Include structured document tree in result.
|
|
@@ -114,7 +113,7 @@ impl Default for ConversionOptions {
|
|
|
114
113
|
heading_style: HeadingStyle::default(),
|
|
115
114
|
list_indent_type: ListIndentType::default(),
|
|
116
115
|
list_indent_width: 2,
|
|
117
|
-
bullets: "
|
|
116
|
+
bullets: "-*+".to_string(),
|
|
118
117
|
strong_em_symbol: '*',
|
|
119
118
|
escape_asterisks: false,
|
|
120
119
|
escape_underscores: false,
|
|
@@ -142,6 +141,7 @@ impl Default for ConversionOptions {
|
|
|
142
141
|
strip_tags: Vec::new(),
|
|
143
142
|
preserve_tags: Vec::new(),
|
|
144
143
|
skip_images: false,
|
|
144
|
+
link_style: LinkStyle::default(),
|
|
145
145
|
output_format: OutputFormat::default(),
|
|
146
146
|
include_document_structure: false,
|
|
147
147
|
extract_images: false,
|
|
@@ -207,6 +207,7 @@ impl ConversionOptionsBuilder {
|
|
|
207
207
|
builder_setter!(newline_style, NewlineStyle);
|
|
208
208
|
builder_setter!(highlight_style, HighlightStyle);
|
|
209
209
|
builder_setter_into!(code_language, String);
|
|
210
|
+
builder_setter!(link_style, LinkStyle);
|
|
210
211
|
builder_setter!(autolinks, bool);
|
|
211
212
|
builder_setter!(default_title, bool);
|
|
212
213
|
builder_setter!(br_in_tables, bool);
|
|
@@ -289,10 +290,7 @@ use crate::options::preprocessing::PreprocessingOptionsUpdate;
|
|
|
289
290
|
any(feature = "serde", feature = "metadata"),
|
|
290
291
|
derive(serde::Serialize, serde::Deserialize)
|
|
291
292
|
)]
|
|
292
|
-
#[cfg_attr(
|
|
293
|
-
any(feature = "serde", feature = "metadata"),
|
|
294
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
295
|
-
)]
|
|
293
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
296
294
|
pub struct ConversionOptionsUpdate {
|
|
297
295
|
/// Optional override for [`ConversionOptions::heading_style`].
|
|
298
296
|
pub heading_style: Option<HeadingStyle>,
|
|
@@ -356,6 +354,8 @@ pub struct ConversionOptionsUpdate {
|
|
|
356
354
|
pub preserve_tags: Option<Vec<String>>,
|
|
357
355
|
/// Optional override for [`ConversionOptions::skip_images`].
|
|
358
356
|
pub skip_images: Option<bool>,
|
|
357
|
+
/// Optional override for [`ConversionOptions::link_style`].
|
|
358
|
+
pub link_style: Option<LinkStyle>,
|
|
359
359
|
/// Optional override for [`ConversionOptions::output_format`].
|
|
360
360
|
pub output_format: Option<OutputFormat>,
|
|
361
361
|
/// Optional override for [`ConversionOptions::include_document_structure`].
|
|
@@ -410,6 +410,7 @@ impl ConversionOptions {
|
|
|
410
410
|
apply!(strip_tags);
|
|
411
411
|
apply!(preserve_tags);
|
|
412
412
|
apply!(skip_images);
|
|
413
|
+
apply!(link_style);
|
|
413
414
|
apply!(output_format);
|
|
414
415
|
apply!(include_document_structure);
|
|
415
416
|
apply!(extract_images);
|
|
@@ -465,8 +466,8 @@ mod tests {
|
|
|
465
466
|
#[test]
|
|
466
467
|
fn test_conversion_options_partial_deserialization() {
|
|
467
468
|
let partial_json = r#"{
|
|
468
|
-
"
|
|
469
|
-
"
|
|
469
|
+
"heading_style": "atxclosed",
|
|
470
|
+
"list_indent_width": 4,
|
|
470
471
|
"bullets": "*"
|
|
471
472
|
}"#;
|
|
472
473
|
|
|
@@ -10,10 +10,10 @@ pub mod preprocessing;
|
|
|
10
10
|
pub mod validation;
|
|
11
11
|
|
|
12
12
|
// Re-exports for easy access
|
|
13
|
-
pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
|
|
13
|
+
pub use conversion::{ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate};
|
|
14
14
|
pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
|
|
15
15
|
pub use validation::{
|
|
16
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
16
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
17
17
|
};
|
|
18
18
|
|
|
19
19
|
// Note: InlineImageConfig is re-exported from the inline_images module,
|
|
@@ -42,10 +42,7 @@ impl PreprocessingPreset {
|
|
|
42
42
|
any(feature = "serde", feature = "metadata"),
|
|
43
43
|
derive(serde::Serialize, serde::Deserialize)
|
|
44
44
|
)]
|
|
45
|
-
#[cfg_attr(
|
|
46
|
-
any(feature = "serde", feature = "metadata"),
|
|
47
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
48
|
-
)]
|
|
45
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
|
|
49
46
|
pub struct PreprocessingOptions {
|
|
50
47
|
/// Enable HTML preprocessing globally
|
|
51
48
|
pub enabled: bool,
|
|
@@ -70,10 +67,7 @@ pub struct PreprocessingOptions {
|
|
|
70
67
|
any(feature = "serde", feature = "metadata"),
|
|
71
68
|
derive(serde::Serialize, serde::Deserialize)
|
|
72
69
|
)]
|
|
73
|
-
#[cfg_attr(
|
|
74
|
-
any(feature = "serde", feature = "metadata"),
|
|
75
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
76
|
-
)]
|
|
70
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
77
71
|
pub struct PreprocessingOptionsUpdate {
|
|
78
72
|
/// Optional global preprocessing enablement override
|
|
79
73
|
pub enabled: Option<bool>,
|
|
@@ -91,7 +85,7 @@ pub struct PreprocessingOptionsUpdate {
|
|
|
91
85
|
impl Default for PreprocessingOptions {
|
|
92
86
|
fn default() -> Self {
|
|
93
87
|
Self {
|
|
94
|
-
enabled:
|
|
88
|
+
enabled: true,
|
|
95
89
|
preset: PreprocessingPreset::default(),
|
|
96
90
|
remove_navigation: true,
|
|
97
91
|
remove_forms: true,
|
|
@@ -115,10 +115,10 @@ impl NewlineStyle {
|
|
|
115
115
|
/// Determines how code blocks (`<pre><code>`) are rendered in Markdown.
|
|
116
116
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
|
117
117
|
pub enum CodeBlockStyle {
|
|
118
|
-
/// Indented code blocks (4 spaces).
|
|
119
|
-
#[default]
|
|
118
|
+
/// Indented code blocks (4 spaces). `CommonMark` standard.
|
|
120
119
|
Indented,
|
|
121
|
-
/// Fenced code blocks with backticks (```). Supports language hints.
|
|
120
|
+
/// Fenced code blocks with backticks (```). Default (GFM). Supports language hints.
|
|
121
|
+
#[default]
|
|
122
122
|
Backticks,
|
|
123
123
|
/// Fenced code blocks with tildes (~~~). Supports language hints.
|
|
124
124
|
Tildes,
|
|
@@ -172,6 +172,33 @@ impl HighlightStyle {
|
|
|
172
172
|
}
|
|
173
173
|
}
|
|
174
174
|
|
|
175
|
+
/// Link rendering style in Markdown output.
|
|
176
|
+
///
|
|
177
|
+
/// Controls whether links and images use inline `[text](url)` syntax or
|
|
178
|
+
/// reference-style `[text][1]` syntax with definitions collected at the end.
|
|
179
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
|
180
|
+
pub enum LinkStyle {
|
|
181
|
+
/// Inline links: `[text](url)`. Default.
|
|
182
|
+
#[default]
|
|
183
|
+
Inline,
|
|
184
|
+
/// Reference-style links: `[text][1]` with `[1]: url` at end of document.
|
|
185
|
+
Reference,
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
impl LinkStyle {
|
|
189
|
+
/// Parse a link style from a string.
|
|
190
|
+
///
|
|
191
|
+
/// Accepts "reference" or defaults to Inline.
|
|
192
|
+
/// Input is normalized (lowercased, alphanumeric only).
|
|
193
|
+
#[must_use]
|
|
194
|
+
pub fn parse(value: &str) -> Self {
|
|
195
|
+
match normalize_token(value).as_str() {
|
|
196
|
+
"reference" => Self::Reference,
|
|
197
|
+
_ => Self::Inline,
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
175
202
|
/// Output format for conversion.
|
|
176
203
|
///
|
|
177
204
|
/// Specifies the target markup language format for the conversion output.
|
|
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
|
|
|
215
242
|
#[cfg(any(feature = "serde", feature = "metadata"))]
|
|
216
243
|
mod serde_impls {
|
|
217
244
|
use super::{
|
|
218
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
245
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
246
|
+
WhitespaceMode,
|
|
219
247
|
};
|
|
220
248
|
use serde::{Deserialize, Serialize, Serializer};
|
|
221
249
|
|
|
@@ -239,6 +267,7 @@ mod serde_impls {
|
|
|
239
267
|
impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
|
|
240
268
|
impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
|
|
241
269
|
impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
|
|
270
|
+
impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
|
|
242
271
|
impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
|
|
243
272
|
|
|
244
273
|
// Serialize implementations that convert enum variants to their string representations
|
|
@@ -324,6 +353,19 @@ mod serde_impls {
|
|
|
324
353
|
}
|
|
325
354
|
}
|
|
326
355
|
|
|
356
|
+
impl Serialize for LinkStyle {
|
|
357
|
+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
358
|
+
where
|
|
359
|
+
S: Serializer,
|
|
360
|
+
{
|
|
361
|
+
let s = match self {
|
|
362
|
+
Self::Inline => "inline",
|
|
363
|
+
Self::Reference => "reference",
|
|
364
|
+
};
|
|
365
|
+
serializer.serialize_str(s)
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
327
369
|
impl Serialize for OutputFormat {
|
|
328
370
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
329
371
|
where
|
|
@@ -147,8 +147,10 @@ pub struct TextAnnotation {
|
|
|
147
147
|
/// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
148
148
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
149
149
|
#[serde(tag = "annotation_type", rename_all = "snake_case")]
|
|
150
|
+
#[derive(Default)]
|
|
150
151
|
pub enum AnnotationKind {
|
|
151
152
|
/// Bold / strong emphasis.
|
|
153
|
+
#[default]
|
|
152
154
|
Bold,
|
|
153
155
|
/// Italic / emphasis.
|
|
154
156
|
Italic,
|
|
@@ -173,3 +175,12 @@ pub enum AnnotationKind {
|
|
|
173
175
|
title: Option<String>,
|
|
174
176
|
},
|
|
175
177
|
}
|
|
178
|
+
|
|
179
|
+
impl Default for NodeContent {
|
|
180
|
+
fn default() -> Self {
|
|
181
|
+
Self::Heading {
|
|
182
|
+
level: 1,
|
|
183
|
+
text: String::new(),
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
//! The primary result type for HTML conversion and extraction.
|
|
2
2
|
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
3
5
|
use super::document::DocumentStructure;
|
|
4
6
|
use super::tables::TableData;
|
|
5
7
|
use super::warnings::ProcessingWarning;
|
|
@@ -11,14 +13,14 @@ use super::warnings::ProcessingWarning;
|
|
|
11
13
|
///
|
|
12
14
|
/// # Example
|
|
13
15
|
///
|
|
14
|
-
/// ```
|
|
16
|
+
/// ```text
|
|
15
17
|
/// use html_to_markdown_rs::{convert, ConversionOptions};
|
|
16
18
|
///
|
|
17
19
|
/// let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
18
20
|
/// assert!(result.content.is_some());
|
|
19
21
|
/// assert!(result.warnings.is_empty());
|
|
20
22
|
/// ```
|
|
21
|
-
#[derive(Debug, Clone, Default)]
|
|
23
|
+
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
22
24
|
pub struct ConversionResult {
|
|
23
25
|
/// Converted text output (markdown, djot, or plain text).
|
|
24
26
|
///
|
|
@@ -42,6 +44,7 @@ pub struct ConversionResult {
|
|
|
42
44
|
///
|
|
43
45
|
/// Populated when `extract_images` is `true` in options.
|
|
44
46
|
#[cfg(feature = "inline-images")]
|
|
47
|
+
#[serde(skip)]
|
|
45
48
|
pub images: Vec<crate::inline_images::InlineImage>,
|
|
46
49
|
|
|
47
50
|
/// Non-fatal processing warnings.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
use serde::{Deserialize, Serialize};
|
|
4
4
|
|
|
5
5
|
/// A structured table grid with cell-level data including spans.
|
|
6
|
-
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
6
|
+
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
7
7
|
pub struct TableGrid {
|
|
8
8
|
/// Number of rows.
|
|
9
9
|
pub rows: u32,
|