html-to-markdown 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -14
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +5 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +126 -52
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,18 +1,27 @@
|
|
|
1
1
|
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
|
|
2
2
|
|
|
3
|
-
//! Main conversion options
|
|
4
|
-
//!
|
|
5
|
-
//! This module provides the primary `ConversionOptions` struct with all configuration
|
|
6
|
-
//! settings for HTML to Markdown conversion, along with partial update support for
|
|
7
|
-
//! selective option modifications.
|
|
3
|
+
//! Main conversion options with builder pattern.
|
|
8
4
|
|
|
9
5
|
use crate::options::preprocessing::PreprocessingOptions;
|
|
10
|
-
use crate::options::preprocessing::PreprocessingOptionsUpdate;
|
|
11
6
|
use crate::options::validation::{
|
|
12
7
|
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
13
8
|
};
|
|
14
9
|
|
|
15
10
|
/// Main conversion options for HTML to Markdown conversion.
|
|
11
|
+
///
|
|
12
|
+
/// Use [`ConversionOptions::builder()`] to construct, or [`Default::default()`] for defaults.
|
|
13
|
+
///
|
|
14
|
+
/// # Example
|
|
15
|
+
///
|
|
16
|
+
/// ```rust,ignore
|
|
17
|
+
/// use html_to_markdown_rs::ConversionOptions;
|
|
18
|
+
///
|
|
19
|
+
/// let options = ConversionOptions::builder()
|
|
20
|
+
/// .heading_style(HeadingStyle::Atx)
|
|
21
|
+
/// .wrap(true)
|
|
22
|
+
/// .wrap_width(100)
|
|
23
|
+
/// .build();
|
|
24
|
+
/// ```
|
|
16
25
|
#[derive(Debug, Clone)]
|
|
17
26
|
#[cfg_attr(
|
|
18
27
|
any(feature = "serde", feature = "metadata"),
|
|
@@ -20,390 +29,399 @@ use crate::options::validation::{
|
|
|
20
29
|
)]
|
|
21
30
|
#[cfg_attr(
|
|
22
31
|
any(feature = "serde", feature = "metadata"),
|
|
23
|
-
serde(rename_all = "camelCase", default)
|
|
32
|
+
serde(rename_all = "camelCase", default, deny_unknown_fields)
|
|
24
33
|
)]
|
|
25
34
|
pub struct ConversionOptions {
|
|
26
|
-
/// Heading style (
|
|
35
|
+
/// Heading style to use in Markdown output (ATX `#` or Setext underline).
|
|
27
36
|
pub heading_style: HeadingStyle,
|
|
28
|
-
|
|
29
|
-
/// List indentation type (Spaces or Tabs)
|
|
37
|
+
/// How to indent nested list items (spaces or tab).
|
|
30
38
|
pub list_indent_type: ListIndentType,
|
|
31
|
-
|
|
32
|
-
/// List indentation width in spaces (applied if using spaces indentation)
|
|
39
|
+
/// Number of spaces (or tabs) to use for each level of list indentation.
|
|
33
40
|
pub list_indent_width: usize,
|
|
34
|
-
|
|
35
|
-
/// Bullet characters for unordered lists (e.g., "-", "*", "+")
|
|
41
|
+
/// Bullet character(s) to use for unordered list items (e.g. `"-"`, `"*"`).
|
|
36
42
|
pub bullets: String,
|
|
37
|
-
|
|
38
|
-
/// Symbol for strong/emphasis emphasis rendering (* or _)
|
|
43
|
+
/// Character used for bold/italic emphasis markers (`*` or `_`).
|
|
39
44
|
pub strong_em_symbol: char,
|
|
40
|
-
|
|
41
|
-
/// Escape asterisks (*) in text to prevent accidental formatting
|
|
45
|
+
/// Escape `*` characters in plain text to avoid unintended bold/italic.
|
|
42
46
|
pub escape_asterisks: bool,
|
|
43
|
-
|
|
44
|
-
/// Escape underscores (_) in text to prevent accidental formatting
|
|
47
|
+
/// Escape `_` characters in plain text to avoid unintended bold/italic.
|
|
45
48
|
pub escape_underscores: bool,
|
|
46
|
-
|
|
47
|
-
/// Escape miscellaneous markdown characters (\ & < ` [ > ~ # = + | -)
|
|
49
|
+
/// Escape miscellaneous Markdown metacharacters (`[]()#` etc.) in plain text.
|
|
48
50
|
pub escape_misc: bool,
|
|
49
|
-
|
|
50
|
-
/// Escape all ASCII punctuation characters (for `CommonMark` spec compliance tests)
|
|
51
|
+
/// Escape ASCII characters that have special meaning in certain Markdown dialects.
|
|
51
52
|
pub escape_ascii: bool,
|
|
52
|
-
|
|
53
|
-
/// Default code language for fenced code blocks when not specified
|
|
53
|
+
/// Default language annotation for fenced code blocks that have no language hint.
|
|
54
54
|
pub code_language: String,
|
|
55
|
-
|
|
56
|
-
/// Use autolinks syntax for bare URLs (<http://example.com>)
|
|
55
|
+
/// Automatically convert bare URLs into Markdown autolinks.
|
|
57
56
|
pub autolinks: bool,
|
|
58
|
-
|
|
59
|
-
/// Add default title element to HTML if none exists before conversion
|
|
57
|
+
/// Emit a default title when no `<title>` tag is present.
|
|
60
58
|
pub default_title: bool,
|
|
61
|
-
|
|
62
|
-
/// Use HTML <br> elements in tables instead of spaces for line breaks
|
|
59
|
+
/// Render `<br>` elements inside table cells as literal line breaks.
|
|
63
60
|
pub br_in_tables: bool,
|
|
64
|
-
|
|
65
|
-
/// Enable spatial table reconstruction in hOCR documents (via spatial positioning analysis).
|
|
66
|
-
///
|
|
67
|
-
/// **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
68
|
-
pub hocr_spatial_tables: bool,
|
|
69
|
-
|
|
70
|
-
/// Highlight style for <mark> elements (`DoubleEqual`, Html, Bold, None)
|
|
61
|
+
/// Style used for `<mark>` / highlighted text (e.g. `==text==`).
|
|
71
62
|
pub highlight_style: HighlightStyle,
|
|
72
|
-
|
|
73
|
-
/// Extract metadata from HTML (title, description, images, links, etc.)
|
|
63
|
+
/// Extract `<meta>` and `<head>` information into the result metadata.
|
|
74
64
|
pub extract_metadata: bool,
|
|
75
|
-
|
|
76
|
-
/// Whitespace handling mode (Normalized collapses multiple spaces, Strict preserves)
|
|
65
|
+
/// Controls how whitespace is normalised during conversion.
|
|
77
66
|
pub whitespace_mode: WhitespaceMode,
|
|
78
|
-
|
|
79
|
-
/// Strip newline characters from HTML before processing
|
|
67
|
+
/// Strip all newlines from the output, producing a single-line result.
|
|
80
68
|
pub strip_newlines: bool,
|
|
81
|
-
|
|
82
|
-
/// Enable automatic text wrapping at `wrap_width`
|
|
69
|
+
/// Wrap long lines at [`wrap_width`](Self::wrap_width) characters.
|
|
83
70
|
pub wrap: bool,
|
|
84
|
-
|
|
85
|
-
/// Text wrapping width in characters (default 80)
|
|
71
|
+
/// Maximum line width when [`wrap`](Self::wrap) is enabled (default `80`).
|
|
86
72
|
pub wrap_width: usize,
|
|
87
|
-
|
|
88
|
-
/// Treat block-level elements as inline during conversion
|
|
73
|
+
/// Treat the entire document as inline content (no block-level wrappers).
|
|
89
74
|
pub convert_as_inline: bool,
|
|
90
|
-
|
|
91
|
-
/// Custom symbol for subscript content (e.g., "~")
|
|
75
|
+
/// Markdown notation for subscript text (e.g. `"~"`).
|
|
92
76
|
pub sub_symbol: String,
|
|
93
|
-
|
|
94
|
-
/// Custom symbol for superscript content (e.g., "^")
|
|
77
|
+
/// Markdown notation for superscript text (e.g. `"^"`).
|
|
95
78
|
pub sup_symbol: String,
|
|
96
|
-
|
|
97
|
-
/// Newline style in markdown output (Spaces adds two spaces, Backslash adds \)
|
|
79
|
+
/// How to encode hard line breaks (`<br>`) in Markdown.
|
|
98
80
|
pub newline_style: NewlineStyle,
|
|
99
|
-
|
|
100
|
-
/// Code block fence style (Indented, Backticks, Tildes)
|
|
81
|
+
/// Style used for fenced code blocks (backticks or tilde).
|
|
101
82
|
pub code_block_style: CodeBlockStyle,
|
|
102
|
-
|
|
103
|
-
/// HTML elements where images should remain as markdown links (not converted to alt text)
|
|
83
|
+
/// HTML tag names whose `<img>` children are kept inline instead of block.
|
|
104
84
|
pub keep_inline_images_in: Vec<String>,
|
|
105
|
-
|
|
106
|
-
/// HTML preprocessing options (remove nav, forms, etc.)
|
|
85
|
+
/// Pre-processing options applied to the HTML before conversion.
|
|
107
86
|
pub preprocessing: PreprocessingOptions,
|
|
108
|
-
|
|
109
|
-
/// Source document encoding (informational, typically "utf-8")
|
|
87
|
+
/// Expected character encoding of the input HTML (default `"utf-8"`).
|
|
110
88
|
pub encoding: String,
|
|
111
|
-
|
|
112
|
-
/// Enable debug mode with diagnostic warnings on conversion issues
|
|
89
|
+
/// Emit debug information during conversion.
|
|
113
90
|
pub debug: bool,
|
|
114
|
-
|
|
115
|
-
/// HTML tags to strip (extract text content, no markdown conversion)
|
|
91
|
+
/// HTML tag names whose content is stripped from the output entirely.
|
|
116
92
|
pub strip_tags: Vec<String>,
|
|
117
|
-
|
|
118
|
-
/// HTML tags to preserve as-is in output (keep original HTML, useful for complex tables)
|
|
93
|
+
/// HTML tag names that are preserved verbatim in the output.
|
|
119
94
|
pub preserve_tags: Vec<String>,
|
|
120
|
-
|
|
121
|
-
/// Skip all images during conversion.
|
|
122
|
-
/// When enabled, all `<img>` elements are completely omitted from output.
|
|
123
|
-
/// Useful for text-only extraction or filtering out visual content.
|
|
95
|
+
/// Skip conversion of `<img>` elements (omit images from output).
|
|
124
96
|
pub skip_images: bool,
|
|
125
|
-
|
|
126
|
-
/// Output format for conversion (Markdown, Djot, or Plain)
|
|
97
|
+
/// Target output format (Markdown, plain text, etc.).
|
|
127
98
|
pub output_format: OutputFormat,
|
|
99
|
+
/// Include structured document tree in result.
|
|
100
|
+
pub include_document_structure: bool,
|
|
101
|
+
/// Extract inline images from data URIs and SVGs.
|
|
102
|
+
pub extract_images: bool,
|
|
103
|
+
/// Maximum decoded image size in bytes (default 5MB).
|
|
104
|
+
pub max_image_size: u64,
|
|
105
|
+
/// Capture SVG elements as images.
|
|
106
|
+
pub capture_svg: bool,
|
|
107
|
+
/// Infer image dimensions from data.
|
|
108
|
+
pub infer_dimensions: bool,
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
impl Default for ConversionOptions {
|
|
112
|
+
fn default() -> Self {
|
|
113
|
+
Self {
|
|
114
|
+
heading_style: HeadingStyle::default(),
|
|
115
|
+
list_indent_type: ListIndentType::default(),
|
|
116
|
+
list_indent_width: 2,
|
|
117
|
+
bullets: "-".to_string(),
|
|
118
|
+
strong_em_symbol: '*',
|
|
119
|
+
escape_asterisks: false,
|
|
120
|
+
escape_underscores: false,
|
|
121
|
+
escape_misc: false,
|
|
122
|
+
escape_ascii: false,
|
|
123
|
+
code_language: String::new(),
|
|
124
|
+
autolinks: true,
|
|
125
|
+
default_title: false,
|
|
126
|
+
br_in_tables: false,
|
|
127
|
+
highlight_style: HighlightStyle::default(),
|
|
128
|
+
extract_metadata: true,
|
|
129
|
+
whitespace_mode: WhitespaceMode::default(),
|
|
130
|
+
strip_newlines: false,
|
|
131
|
+
wrap: false,
|
|
132
|
+
wrap_width: 80,
|
|
133
|
+
convert_as_inline: false,
|
|
134
|
+
sub_symbol: String::new(),
|
|
135
|
+
sup_symbol: String::new(),
|
|
136
|
+
newline_style: NewlineStyle::Spaces,
|
|
137
|
+
code_block_style: CodeBlockStyle::default(),
|
|
138
|
+
keep_inline_images_in: Vec::new(),
|
|
139
|
+
preprocessing: PreprocessingOptions::default(),
|
|
140
|
+
encoding: "utf-8".to_string(),
|
|
141
|
+
debug: false,
|
|
142
|
+
strip_tags: Vec::new(),
|
|
143
|
+
preserve_tags: Vec::new(),
|
|
144
|
+
skip_images: false,
|
|
145
|
+
output_format: OutputFormat::default(),
|
|
146
|
+
include_document_structure: false,
|
|
147
|
+
extract_images: false,
|
|
148
|
+
max_image_size: 5_242_880,
|
|
149
|
+
capture_svg: false,
|
|
150
|
+
infer_dimensions: true,
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
impl ConversionOptions {
|
|
156
|
+
/// Create a new builder with default values.
|
|
157
|
+
#[must_use]
|
|
158
|
+
pub fn builder() -> ConversionOptionsBuilder {
|
|
159
|
+
ConversionOptionsBuilder(Self::default())
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// ── Builder ─────────────────────────────────────────────────────────────────
|
|
164
|
+
|
|
165
|
+
/// Builder for [`ConversionOptions`].
|
|
166
|
+
///
|
|
167
|
+
/// All fields start with default values. Call `.build()` to produce the final options.
|
|
168
|
+
#[derive(Debug, Clone)]
|
|
169
|
+
pub struct ConversionOptionsBuilder(ConversionOptions);
|
|
170
|
+
|
|
171
|
+
macro_rules! builder_setter {
|
|
172
|
+
($name:ident, $ty:ty) => {
|
|
173
|
+
/// Set the value.
|
|
174
|
+
#[must_use]
|
|
175
|
+
pub fn $name(mut self, value: $ty) -> Self {
|
|
176
|
+
self.0.$name = value;
|
|
177
|
+
self
|
|
178
|
+
}
|
|
179
|
+
};
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
macro_rules! builder_setter_into {
|
|
183
|
+
($name:ident, $ty:ty) => {
|
|
184
|
+
/// Set the value.
|
|
185
|
+
#[must_use]
|
|
186
|
+
pub fn $name(mut self, value: impl Into<$ty>) -> Self {
|
|
187
|
+
self.0.$name = value.into();
|
|
188
|
+
self
|
|
189
|
+
}
|
|
190
|
+
};
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
impl ConversionOptionsBuilder {
|
|
194
|
+
// Output control
|
|
195
|
+
builder_setter!(output_format, OutputFormat);
|
|
196
|
+
builder_setter!(include_document_structure, bool);
|
|
197
|
+
builder_setter!(extract_metadata, bool);
|
|
198
|
+
builder_setter!(extract_images, bool);
|
|
199
|
+
|
|
200
|
+
// Markdown formatting
|
|
201
|
+
builder_setter!(heading_style, HeadingStyle);
|
|
202
|
+
builder_setter!(list_indent_type, ListIndentType);
|
|
203
|
+
builder_setter!(list_indent_width, usize);
|
|
204
|
+
builder_setter_into!(bullets, String);
|
|
205
|
+
builder_setter!(strong_em_symbol, char);
|
|
206
|
+
builder_setter!(code_block_style, CodeBlockStyle);
|
|
207
|
+
builder_setter!(newline_style, NewlineStyle);
|
|
208
|
+
builder_setter!(highlight_style, HighlightStyle);
|
|
209
|
+
builder_setter_into!(code_language, String);
|
|
210
|
+
builder_setter!(autolinks, bool);
|
|
211
|
+
builder_setter!(default_title, bool);
|
|
212
|
+
builder_setter!(br_in_tables, bool);
|
|
213
|
+
builder_setter_into!(sub_symbol, String);
|
|
214
|
+
builder_setter_into!(sup_symbol, String);
|
|
215
|
+
|
|
216
|
+
// Escaping
|
|
217
|
+
builder_setter!(escape_asterisks, bool);
|
|
218
|
+
builder_setter!(escape_underscores, bool);
|
|
219
|
+
builder_setter!(escape_misc, bool);
|
|
220
|
+
builder_setter!(escape_ascii, bool);
|
|
221
|
+
|
|
222
|
+
// Whitespace / wrapping
|
|
223
|
+
builder_setter!(whitespace_mode, WhitespaceMode);
|
|
224
|
+
builder_setter!(strip_newlines, bool);
|
|
225
|
+
builder_setter!(wrap, bool);
|
|
226
|
+
builder_setter!(wrap_width, usize);
|
|
227
|
+
|
|
228
|
+
// Element handling
|
|
229
|
+
builder_setter!(convert_as_inline, bool);
|
|
230
|
+
builder_setter!(skip_images, bool);
|
|
231
|
+
|
|
232
|
+
/// Set the list of HTML tag names whose content is stripped from output.
|
|
233
|
+
#[must_use]
|
|
234
|
+
pub fn strip_tags(mut self, tags: Vec<String>) -> Self {
|
|
235
|
+
self.0.strip_tags = tags;
|
|
236
|
+
self
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
/// Set the list of HTML tag names that are preserved verbatim in output.
|
|
240
|
+
#[must_use]
|
|
241
|
+
pub fn preserve_tags(mut self, tags: Vec<String>) -> Self {
|
|
242
|
+
self.0.preserve_tags = tags;
|
|
243
|
+
self
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
/// Set the list of HTML tag names whose `<img>` children are kept inline.
|
|
247
|
+
#[must_use]
|
|
248
|
+
pub fn keep_inline_images_in(mut self, tags: Vec<String>) -> Self {
|
|
249
|
+
self.0.keep_inline_images_in = tags;
|
|
250
|
+
self
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Image extraction config
|
|
254
|
+
builder_setter!(max_image_size, u64);
|
|
255
|
+
builder_setter!(capture_svg, bool);
|
|
256
|
+
builder_setter!(infer_dimensions, bool);
|
|
257
|
+
|
|
258
|
+
// Preprocessing
|
|
259
|
+
/// Set the pre-processing options applied to the HTML before conversion.
|
|
260
|
+
#[must_use]
|
|
261
|
+
pub fn preprocessing(mut self, preprocessing: PreprocessingOptions) -> Self {
|
|
262
|
+
self.0.preprocessing = preprocessing;
|
|
263
|
+
self
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// Encoding
|
|
267
|
+
builder_setter_into!(encoding, String);
|
|
268
|
+
|
|
269
|
+
// Debug
|
|
270
|
+
builder_setter!(debug, bool);
|
|
271
|
+
|
|
272
|
+
/// Build the final [`ConversionOptions`].
|
|
273
|
+
#[must_use]
|
|
274
|
+
pub fn build(self) -> ConversionOptions {
|
|
275
|
+
self.0
|
|
276
|
+
}
|
|
128
277
|
}
|
|
129
278
|
|
|
279
|
+
// ── ConversionOptionsUpdate (for binding crate compatibility) ────────────
|
|
280
|
+
|
|
281
|
+
use crate::options::preprocessing::PreprocessingOptionsUpdate;
|
|
282
|
+
|
|
130
283
|
/// Partial update for `ConversionOptions`.
|
|
131
284
|
///
|
|
132
|
-
///
|
|
133
|
-
///
|
|
134
|
-
/// corresponding fields unchanged when applied via [`ConversionOptions::apply_update`].
|
|
285
|
+
/// Uses `Option<T>` fields for selective updates. Bindings use this to construct
|
|
286
|
+
/// options from language-native types. Prefer [`ConversionOptionsBuilder`] for Rust code.
|
|
135
287
|
#[derive(Debug, Clone, Default)]
|
|
136
288
|
#[cfg_attr(
|
|
137
289
|
any(feature = "serde", feature = "metadata"),
|
|
138
290
|
derive(serde::Serialize, serde::Deserialize)
|
|
139
291
|
)]
|
|
140
|
-
#[cfg_attr(
|
|
292
|
+
#[cfg_attr(
|
|
293
|
+
any(feature = "serde", feature = "metadata"),
|
|
294
|
+
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
295
|
+
)]
|
|
141
296
|
pub struct ConversionOptionsUpdate {
|
|
142
|
-
/// Optional
|
|
297
|
+
/// Optional override for [`ConversionOptions::heading_style`].
|
|
143
298
|
pub heading_style: Option<HeadingStyle>,
|
|
144
|
-
|
|
145
|
-
/// Optional list indentation type override (Spaces or Tabs)
|
|
299
|
+
/// Optional override for [`ConversionOptions::list_indent_type`].
|
|
146
300
|
pub list_indent_type: Option<ListIndentType>,
|
|
147
|
-
|
|
148
|
-
/// Optional list indentation width override in spaces
|
|
301
|
+
/// Optional override for [`ConversionOptions::list_indent_width`].
|
|
149
302
|
pub list_indent_width: Option<usize>,
|
|
150
|
-
|
|
151
|
-
/// Optional bullet characters override for unordered lists
|
|
303
|
+
/// Optional override for [`ConversionOptions::bullets`].
|
|
152
304
|
pub bullets: Option<String>,
|
|
153
|
-
|
|
154
|
-
/// Optional strong/emphasis symbol override (* or _)
|
|
305
|
+
/// Optional override for [`ConversionOptions::strong_em_symbol`].
|
|
155
306
|
pub strong_em_symbol: Option<char>,
|
|
156
|
-
|
|
157
|
-
/// Optional asterisk escaping override in text content
|
|
307
|
+
/// Optional override for [`ConversionOptions::escape_asterisks`].
|
|
158
308
|
pub escape_asterisks: Option<bool>,
|
|
159
|
-
|
|
160
|
-
/// Optional underscore escaping override in text content
|
|
309
|
+
/// Optional override for [`ConversionOptions::escape_underscores`].
|
|
161
310
|
pub escape_underscores: Option<bool>,
|
|
162
|
-
|
|
163
|
-
/// Optional miscellaneous character escaping override (\ & < ` [ > ~ # = + | -)
|
|
311
|
+
/// Optional override for [`ConversionOptions::escape_misc`].
|
|
164
312
|
pub escape_misc: Option<bool>,
|
|
165
|
-
|
|
166
|
-
/// Optional ASCII punctuation escaping override (for spec compliance testing)
|
|
313
|
+
/// Optional override for [`ConversionOptions::escape_ascii`].
|
|
167
314
|
pub escape_ascii: Option<bool>,
|
|
168
|
-
|
|
169
|
-
/// Optional default code language override for fenced code blocks
|
|
315
|
+
/// Optional override for [`ConversionOptions::code_language`].
|
|
170
316
|
pub code_language: Option<String>,
|
|
171
|
-
|
|
172
|
-
/// Optional autolinks syntax override for bare URLs
|
|
317
|
+
/// Optional override for [`ConversionOptions::autolinks`].
|
|
173
318
|
pub autolinks: Option<bool>,
|
|
174
|
-
|
|
175
|
-
/// Optional default title element injection override
|
|
319
|
+
/// Optional override for [`ConversionOptions::default_title`].
|
|
176
320
|
pub default_title: Option<bool>,
|
|
177
|
-
|
|
178
|
-
/// Optional HTML <br> usage in tables override
|
|
321
|
+
/// Optional override for [`ConversionOptions::br_in_tables`].
|
|
179
322
|
pub br_in_tables: Option<bool>,
|
|
180
|
-
|
|
181
|
-
/// Optional spatial table reconstruction for hOCR documents override.
|
|
182
|
-
///
|
|
183
|
-
/// **Deprecated since 2.30.0**: hOCR support will be removed in v3.
|
|
184
|
-
pub hocr_spatial_tables: Option<bool>,
|
|
185
|
-
|
|
186
|
-
/// Optional highlight style override for <mark> elements
|
|
323
|
+
/// Optional override for [`ConversionOptions::highlight_style`].
|
|
187
324
|
pub highlight_style: Option<HighlightStyle>,
|
|
188
|
-
|
|
189
|
-
/// Optional metadata extraction override (title, description, images, links)
|
|
325
|
+
/// Optional override for [`ConversionOptions::extract_metadata`].
|
|
190
326
|
pub extract_metadata: Option<bool>,
|
|
191
|
-
|
|
192
|
-
/// Optional whitespace handling mode override (Normalized or Strict)
|
|
327
|
+
/// Optional override for [`ConversionOptions::whitespace_mode`].
|
|
193
328
|
pub whitespace_mode: Option<WhitespaceMode>,
|
|
194
|
-
|
|
195
|
-
/// Optional newline stripping override before processing
|
|
329
|
+
/// Optional override for [`ConversionOptions::strip_newlines`].
|
|
196
330
|
pub strip_newlines: Option<bool>,
|
|
197
|
-
|
|
198
|
-
/// Optional automatic text wrapping override
|
|
331
|
+
/// Optional override for [`ConversionOptions::wrap`].
|
|
199
332
|
pub wrap: Option<bool>,
|
|
200
|
-
|
|
201
|
-
/// Optional text wrapping width override in characters
|
|
333
|
+
/// Optional override for [`ConversionOptions::wrap_width`].
|
|
202
334
|
pub wrap_width: Option<usize>,
|
|
203
|
-
|
|
204
|
-
/// Optional block-level to inline conversion override
|
|
335
|
+
/// Optional override for [`ConversionOptions::convert_as_inline`].
|
|
205
336
|
pub convert_as_inline: Option<bool>,
|
|
206
|
-
|
|
207
|
-
/// Optional subscript symbol override
|
|
337
|
+
/// Optional override for [`ConversionOptions::sub_symbol`].
|
|
208
338
|
pub sub_symbol: Option<String>,
|
|
209
|
-
|
|
210
|
-
/// Optional superscript symbol override
|
|
339
|
+
/// Optional override for [`ConversionOptions::sup_symbol`].
|
|
211
340
|
pub sup_symbol: Option<String>,
|
|
212
|
-
|
|
213
|
-
/// Optional newline style override for markdown output
|
|
341
|
+
/// Optional override for [`ConversionOptions::newline_style`].
|
|
214
342
|
pub newline_style: Option<NewlineStyle>,
|
|
215
|
-
|
|
216
|
-
/// Optional code block fence style override (Indented, Backticks, Tildes)
|
|
343
|
+
/// Optional override for [`ConversionOptions::code_block_style`].
|
|
217
344
|
pub code_block_style: Option<CodeBlockStyle>,
|
|
218
|
-
|
|
219
|
-
/// Optional context elements where images remain as markdown links override
|
|
345
|
+
/// Optional override for [`ConversionOptions::keep_inline_images_in`].
|
|
220
346
|
pub keep_inline_images_in: Option<Vec<String>>,
|
|
221
|
-
|
|
222
|
-
/// Optional preprocessing options partial update
|
|
347
|
+
/// Optional override for [`ConversionOptions::preprocessing`].
|
|
223
348
|
pub preprocessing: Option<PreprocessingOptionsUpdate>,
|
|
224
|
-
|
|
225
|
-
/// Optional source document encoding override
|
|
349
|
+
/// Optional override for [`ConversionOptions::encoding`].
|
|
226
350
|
pub encoding: Option<String>,
|
|
227
|
-
|
|
228
|
-
/// Optional debug mode override for diagnostic warnings
|
|
351
|
+
/// Optional override for [`ConversionOptions::debug`].
|
|
229
352
|
pub debug: Option<bool>,
|
|
230
|
-
|
|
231
|
-
/// Optional HTML tags to strip override (extract text, no conversion)
|
|
353
|
+
/// Optional override for [`ConversionOptions::strip_tags`].
|
|
232
354
|
pub strip_tags: Option<Vec<String>>,
|
|
233
|
-
|
|
234
|
-
/// Optional HTML tags to preserve as-is override in output
|
|
355
|
+
/// Optional override for [`ConversionOptions::preserve_tags`].
|
|
235
356
|
pub preserve_tags: Option<Vec<String>>,
|
|
236
|
-
|
|
237
|
-
/// Optional skip images override
|
|
357
|
+
/// Optional override for [`ConversionOptions::skip_images`].
|
|
238
358
|
pub skip_images: Option<bool>,
|
|
239
|
-
|
|
240
|
-
/// Optional output format override (Markdown, Djot, or Plain)
|
|
359
|
+
/// Optional override for [`ConversionOptions::output_format`].
|
|
241
360
|
pub output_format: Option<OutputFormat>,
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
escape_asterisks: false,
|
|
253
|
-
escape_underscores: false,
|
|
254
|
-
escape_misc: false,
|
|
255
|
-
escape_ascii: false,
|
|
256
|
-
code_language: String::new(),
|
|
257
|
-
autolinks: true,
|
|
258
|
-
default_title: false,
|
|
259
|
-
br_in_tables: false,
|
|
260
|
-
hocr_spatial_tables: true,
|
|
261
|
-
highlight_style: HighlightStyle::default(),
|
|
262
|
-
extract_metadata: true,
|
|
263
|
-
whitespace_mode: WhitespaceMode::default(),
|
|
264
|
-
strip_newlines: false,
|
|
265
|
-
wrap: false,
|
|
266
|
-
wrap_width: 80,
|
|
267
|
-
convert_as_inline: false,
|
|
268
|
-
sub_symbol: String::new(),
|
|
269
|
-
sup_symbol: String::new(),
|
|
270
|
-
newline_style: NewlineStyle::Spaces,
|
|
271
|
-
code_block_style: CodeBlockStyle::default(),
|
|
272
|
-
keep_inline_images_in: Vec::new(),
|
|
273
|
-
preprocessing: PreprocessingOptions::default(),
|
|
274
|
-
encoding: "utf-8".to_string(),
|
|
275
|
-
debug: false,
|
|
276
|
-
strip_tags: Vec::new(),
|
|
277
|
-
preserve_tags: Vec::new(),
|
|
278
|
-
skip_images: false,
|
|
279
|
-
output_format: OutputFormat::default(),
|
|
280
|
-
}
|
|
281
|
-
}
|
|
361
|
+
/// Optional override for [`ConversionOptions::include_document_structure`].
|
|
362
|
+
pub include_document_structure: Option<bool>,
|
|
363
|
+
/// Optional override for [`ConversionOptions::extract_images`].
|
|
364
|
+
pub extract_images: Option<bool>,
|
|
365
|
+
/// Optional override for [`ConversionOptions::max_image_size`].
|
|
366
|
+
pub max_image_size: Option<u64>,
|
|
367
|
+
/// Optional override for [`ConversionOptions::capture_svg`].
|
|
368
|
+
pub capture_svg: Option<bool>,
|
|
369
|
+
/// Optional override for [`ConversionOptions::infer_dimensions`].
|
|
370
|
+
pub infer_dimensions: Option<bool>,
|
|
282
371
|
}
|
|
283
372
|
|
|
284
373
|
impl ConversionOptions {
|
|
285
374
|
/// Apply a partial update to these conversion options.
|
|
286
|
-
///
|
|
287
|
-
/// Any specified fields in the update will override the current values.
|
|
288
|
-
/// Unspecified fields (None) are left unchanged.
|
|
289
|
-
///
|
|
290
|
-
/// # Arguments
|
|
291
|
-
///
|
|
292
|
-
/// * `update` - Partial options update with fields to override
|
|
293
375
|
pub fn apply_update(&mut self, update: ConversionOptionsUpdate) {
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
self.highlight_style = highlight_style;
|
|
338
|
-
}
|
|
339
|
-
if let Some(extract_metadata) = update.extract_metadata {
|
|
340
|
-
self.extract_metadata = extract_metadata;
|
|
341
|
-
}
|
|
342
|
-
if let Some(whitespace_mode) = update.whitespace_mode {
|
|
343
|
-
self.whitespace_mode = whitespace_mode;
|
|
344
|
-
}
|
|
345
|
-
if let Some(strip_newlines) = update.strip_newlines {
|
|
346
|
-
self.strip_newlines = strip_newlines;
|
|
347
|
-
}
|
|
348
|
-
if let Some(wrap) = update.wrap {
|
|
349
|
-
self.wrap = wrap;
|
|
350
|
-
}
|
|
351
|
-
if let Some(wrap_width) = update.wrap_width {
|
|
352
|
-
self.wrap_width = wrap_width;
|
|
353
|
-
}
|
|
354
|
-
if let Some(convert_as_inline) = update.convert_as_inline {
|
|
355
|
-
self.convert_as_inline = convert_as_inline;
|
|
356
|
-
}
|
|
357
|
-
if let Some(sub_symbol) = update.sub_symbol {
|
|
358
|
-
self.sub_symbol = sub_symbol;
|
|
359
|
-
}
|
|
360
|
-
if let Some(sup_symbol) = update.sup_symbol {
|
|
361
|
-
self.sup_symbol = sup_symbol;
|
|
362
|
-
}
|
|
363
|
-
if let Some(newline_style) = update.newline_style {
|
|
364
|
-
self.newline_style = newline_style;
|
|
365
|
-
}
|
|
366
|
-
if let Some(code_block_style) = update.code_block_style {
|
|
367
|
-
self.code_block_style = code_block_style;
|
|
368
|
-
}
|
|
369
|
-
if let Some(keep_inline_images_in) = update.keep_inline_images_in {
|
|
370
|
-
self.keep_inline_images_in = keep_inline_images_in;
|
|
371
|
-
}
|
|
376
|
+
macro_rules! apply {
|
|
377
|
+
($field:ident) => {
|
|
378
|
+
if let Some(v) = update.$field {
|
|
379
|
+
self.$field = v;
|
|
380
|
+
}
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
apply!(heading_style);
|
|
384
|
+
apply!(list_indent_type);
|
|
385
|
+
apply!(list_indent_width);
|
|
386
|
+
apply!(bullets);
|
|
387
|
+
apply!(strong_em_symbol);
|
|
388
|
+
apply!(escape_asterisks);
|
|
389
|
+
apply!(escape_underscores);
|
|
390
|
+
apply!(escape_misc);
|
|
391
|
+
apply!(escape_ascii);
|
|
392
|
+
apply!(code_language);
|
|
393
|
+
apply!(autolinks);
|
|
394
|
+
apply!(default_title);
|
|
395
|
+
apply!(br_in_tables);
|
|
396
|
+
apply!(highlight_style);
|
|
397
|
+
apply!(extract_metadata);
|
|
398
|
+
apply!(whitespace_mode);
|
|
399
|
+
apply!(strip_newlines);
|
|
400
|
+
apply!(wrap);
|
|
401
|
+
apply!(wrap_width);
|
|
402
|
+
apply!(convert_as_inline);
|
|
403
|
+
apply!(sub_symbol);
|
|
404
|
+
apply!(sup_symbol);
|
|
405
|
+
apply!(newline_style);
|
|
406
|
+
apply!(code_block_style);
|
|
407
|
+
apply!(keep_inline_images_in);
|
|
408
|
+
apply!(encoding);
|
|
409
|
+
apply!(debug);
|
|
410
|
+
apply!(strip_tags);
|
|
411
|
+
apply!(preserve_tags);
|
|
412
|
+
apply!(skip_images);
|
|
413
|
+
apply!(output_format);
|
|
414
|
+
apply!(include_document_structure);
|
|
415
|
+
apply!(extract_images);
|
|
416
|
+
apply!(max_image_size);
|
|
417
|
+
apply!(capture_svg);
|
|
418
|
+
apply!(infer_dimensions);
|
|
372
419
|
if let Some(preprocessing) = update.preprocessing {
|
|
373
420
|
self.preprocessing.apply_update(preprocessing);
|
|
374
421
|
}
|
|
375
|
-
if let Some(encoding) = update.encoding {
|
|
376
|
-
self.encoding = encoding;
|
|
377
|
-
}
|
|
378
|
-
if let Some(debug) = update.debug {
|
|
379
|
-
self.debug = debug;
|
|
380
|
-
}
|
|
381
|
-
if let Some(strip_tags) = update.strip_tags {
|
|
382
|
-
self.strip_tags = strip_tags;
|
|
383
|
-
}
|
|
384
|
-
if let Some(preserve_tags) = update.preserve_tags {
|
|
385
|
-
self.preserve_tags = preserve_tags;
|
|
386
|
-
}
|
|
387
|
-
if let Some(skip_images) = update.skip_images {
|
|
388
|
-
self.skip_images = skip_images;
|
|
389
|
-
}
|
|
390
|
-
if let Some(output_format) = update.output_format {
|
|
391
|
-
self.output_format = output_format;
|
|
392
|
-
}
|
|
393
422
|
}
|
|
394
423
|
|
|
395
|
-
/// Create
|
|
396
|
-
///
|
|
397
|
-
/// Creates a new `ConversionOptions` struct with defaults, then applies the update.
|
|
398
|
-
/// Fields not specified in the update keep their default values.
|
|
399
|
-
///
|
|
400
|
-
/// # Arguments
|
|
401
|
-
///
|
|
402
|
-
/// * `update` - Partial options update with fields to set
|
|
403
|
-
///
|
|
404
|
-
/// # Returns
|
|
405
|
-
///
|
|
406
|
-
/// New `ConversionOptions` with specified updates applied to defaults
|
|
424
|
+
/// Create from a partial update, applying to defaults.
|
|
407
425
|
#[must_use]
|
|
408
426
|
pub fn from_update(update: ConversionOptionsUpdate) -> Self {
|
|
409
427
|
let mut options = Self::default();
|
|
@@ -418,28 +436,25 @@ impl From<ConversionOptionsUpdate> for ConversionOptions {
|
|
|
418
436
|
}
|
|
419
437
|
}
|
|
420
438
|
|
|
439
|
+
// ── Tests ───────────────────────────────────────────────────────────────────
|
|
440
|
+
|
|
421
441
|
#[cfg(all(test, any(feature = "serde", feature = "metadata")))]
|
|
422
442
|
mod tests {
|
|
423
443
|
use super::*;
|
|
424
444
|
|
|
425
445
|
#[test]
|
|
426
446
|
fn test_conversion_options_serde() {
|
|
427
|
-
let options = ConversionOptions
|
|
428
|
-
heading_style
|
|
429
|
-
list_indent_width
|
|
430
|
-
bullets
|
|
431
|
-
escape_asterisks
|
|
432
|
-
whitespace_mode
|
|
433
|
-
|
|
434
|
-
};
|
|
435
|
-
|
|
436
|
-
// Serialize to JSON
|
|
437
|
-
let json = serde_json::to_string(&options).expect("Failed to serialize");
|
|
447
|
+
let options = ConversionOptions::builder()
|
|
448
|
+
.heading_style(HeadingStyle::AtxClosed)
|
|
449
|
+
.list_indent_width(4)
|
|
450
|
+
.bullets("*")
|
|
451
|
+
.escape_asterisks(true)
|
|
452
|
+
.whitespace_mode(WhitespaceMode::Strict)
|
|
453
|
+
.build();
|
|
438
454
|
|
|
439
|
-
|
|
455
|
+
let json = serde_json::to_string(&options).expect("Failed to serialize");
|
|
440
456
|
let deserialized: ConversionOptions = serde_json::from_str(&json).expect("Failed to deserialize");
|
|
441
457
|
|
|
442
|
-
// Verify values
|
|
443
458
|
assert_eq!(deserialized.list_indent_width, 4);
|
|
444
459
|
assert_eq!(deserialized.bullets, "*");
|
|
445
460
|
assert!(deserialized.escape_asterisks);
|
|
@@ -449,7 +464,6 @@ mod tests {
|
|
|
449
464
|
|
|
450
465
|
#[test]
|
|
451
466
|
fn test_conversion_options_partial_deserialization() {
|
|
452
|
-
// Test that partial JSON can be deserialized using defaults for missing fields
|
|
453
467
|
let partial_json = r#"{
|
|
454
468
|
"headingStyle": "atxClosed",
|
|
455
469
|
"listIndentWidth": 4,
|
|
@@ -459,14 +473,28 @@ mod tests {
|
|
|
459
473
|
let deserialized: ConversionOptions =
|
|
460
474
|
serde_json::from_str(partial_json).expect("Failed to deserialize partial JSON");
|
|
461
475
|
|
|
462
|
-
// Verify specified values
|
|
463
476
|
assert_eq!(deserialized.heading_style, HeadingStyle::AtxClosed);
|
|
464
477
|
assert_eq!(deserialized.list_indent_width, 4);
|
|
465
478
|
assert_eq!(deserialized.bullets, "*");
|
|
479
|
+
assert!(!deserialized.escape_asterisks);
|
|
480
|
+
assert!(!deserialized.escape_underscores);
|
|
481
|
+
assert_eq!(deserialized.list_indent_type, ListIndentType::Spaces);
|
|
482
|
+
}
|
|
466
483
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
484
|
+
#[test]
|
|
485
|
+
fn test_builder_pattern() {
|
|
486
|
+
let options = ConversionOptions::builder()
|
|
487
|
+
.heading_style(HeadingStyle::Underlined)
|
|
488
|
+
.wrap(true)
|
|
489
|
+
.wrap_width(100)
|
|
490
|
+
.include_document_structure(true)
|
|
491
|
+
.extract_images(true)
|
|
492
|
+
.build();
|
|
493
|
+
|
|
494
|
+
assert_eq!(options.heading_style, HeadingStyle::Underlined);
|
|
495
|
+
assert!(options.wrap);
|
|
496
|
+
assert_eq!(options.wrap_width, 100);
|
|
497
|
+
assert!(options.include_document_structure);
|
|
498
|
+
assert!(options.extract_images);
|
|
471
499
|
}
|
|
472
500
|
}
|