html-to-markdown 3.1.0 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +42 -12
- data/Gemfile +1 -0
- data/Gemfile.lock +27 -55
- data/README.md +9 -10
- data/Rakefile +4 -10
- data/ext/html-to-markdown_rb/Cargo.toml +14 -0
- data/ext/html_to_markdown_rb/Cargo.toml +16 -0
- data/ext/html_to_markdown_rb/extconf.rb +10 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
- data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
- data/html-to-markdown-rb.gemspec +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +31 -21
- data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
- data/sig/html_to_markdown.rbs +17 -5
- data/vendor/Cargo.toml +4 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
- data/vendor/html-to-markdown-rs/src/exports.rs +3 -3
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
- data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +6 -12
- data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
- data/vendor/html-to-markdown-rs/src/options/validation.rs +3 -3
- data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
- data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +3 -3
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
- metadata +11 -18
- data/ext/html-to-markdown-rb/extconf.rb +0 -41
- data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
- data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
- data/ext/html-to-markdown-rb/native/README.md +0 -215
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
- data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
- data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
- data/lib/html_to_markdown/cli.rb +0 -21
- data/lib/html_to_markdown/cli_proxy.rb +0 -74
- data/spec/cli_proxy_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -10
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
|
|
|
87
87
|
spec.files = files
|
|
88
88
|
spec.extra_rdoc_files = ['README.md']
|
|
89
89
|
|
|
90
|
-
spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
|
|
90
|
+
spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
|
|
91
91
|
|
|
92
92
|
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
93
93
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -3,28 +3,38 @@
|
|
|
3
3
|
require_relative 'html_to_markdown/version'
|
|
4
4
|
require 'html_to_markdown_rb'
|
|
5
5
|
|
|
6
|
+
# High-performance HTML to Markdown conversion.
|
|
7
|
+
#
|
|
8
|
+
# @example Simple conversion
|
|
9
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
|
|
10
|
+
#
|
|
11
|
+
# @example With options
|
|
12
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
|
|
6
13
|
module HtmlToMarkdown
|
|
7
|
-
|
|
8
|
-
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
alias native_convert convert
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
module_function
|
|
15
|
-
|
|
16
|
-
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
-
# - :content [String, nil] the converted Markdown output
|
|
18
|
-
# - :document [nil] document structure (not yet exposed)
|
|
19
|
-
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
-
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
-
# - :images [Array<Hash>] extracted inline images
|
|
22
|
-
# - :warnings [Array<Hash>] processing warnings
|
|
14
|
+
# Convert HTML to Markdown.
|
|
23
15
|
#
|
|
24
|
-
# @param html [String] HTML
|
|
25
|
-
# @param options [Hash
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
16
|
+
# @param html [String] The HTML content to convert.
|
|
17
|
+
# @param options [Hash] Optional conversion options.
|
|
18
|
+
# Supported keys (all optional):
|
|
19
|
+
# - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
|
|
20
|
+
# - :code_block_style - 'backticks', 'tildes', 'indented'
|
|
21
|
+
# - :escape_asterisks - Boolean
|
|
22
|
+
# - :escape_underscores - Boolean
|
|
23
|
+
# - :escape_misc - Boolean
|
|
24
|
+
# - :escape_ascii - Boolean
|
|
25
|
+
# - :strip_newlines - Boolean
|
|
26
|
+
# - :keep_inline_images_in - Array of tag names
|
|
27
|
+
# - :strip_tags - Array of tag names to strip
|
|
28
|
+
# - :preserve_tags - Array of tag names to preserve verbatim
|
|
29
|
+
# (and more, matching ConversionOptions fields)
|
|
30
|
+
# @return [String] The converted Markdown content.
|
|
31
|
+
def self.convert(html, options = {})
|
|
32
|
+
opts = if options.nil? || options.empty?
|
|
33
|
+
nil
|
|
34
|
+
else
|
|
35
|
+
HtmlToMarkdownRs::ConversionOptions.new(options)
|
|
36
|
+
end
|
|
37
|
+
result = HtmlToMarkdownRs.convert(html, opts)
|
|
38
|
+
result.content || ''
|
|
29
39
|
end
|
|
30
40
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Native extension module (Magnus/rb-sys)
|
|
2
|
+
module HtmlToMarkdownRs
|
|
3
|
+
class ConversionOptions
|
|
4
|
+
def initialize: (Hash[Symbol, untyped]) -> void
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
class ConversionResult
|
|
8
|
+
def content: () -> String?
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.convert: (String html, ConversionOptions? options) -> ConversionResult
|
|
12
|
+
end
|
|
13
|
+
|
|
1
14
|
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
15
|
module HtmlToMarkdown
|
|
3
16
|
VERSION: String
|
|
@@ -8,6 +21,7 @@ module HtmlToMarkdown
|
|
|
8
21
|
type whitespace_mode = :normalized | :strict
|
|
9
22
|
type newline_style = :spaces | :backslash
|
|
10
23
|
type code_block_style = :indented | :backticks | :tildes
|
|
24
|
+
type link_style = :inline | :reference
|
|
11
25
|
type output_format = :markdown | :djot
|
|
12
26
|
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
13
27
|
|
|
@@ -49,6 +63,7 @@ module HtmlToMarkdown
|
|
|
49
63
|
debug?: bool,
|
|
50
64
|
strip_tags?: Array[String],
|
|
51
65
|
preserve_tags?: Array[String],
|
|
66
|
+
link_style?: link_style,
|
|
52
67
|
output_format?: output_format,
|
|
53
68
|
skip_images?: bool,
|
|
54
69
|
include_document_structure?: bool,
|
|
@@ -126,12 +141,9 @@ module HtmlToMarkdown
|
|
|
126
141
|
|
|
127
142
|
public
|
|
128
143
|
|
|
129
|
-
# Convert HTML to Markdown, returning
|
|
144
|
+
# Convert HTML to Markdown, returning the markdown content string.
|
|
130
145
|
#
|
|
131
146
|
# Example:
|
|
132
147
|
# result = HtmlToMarkdown.convert(html)
|
|
133
|
-
def self.convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
134
|
-
|
|
135
|
-
# Instance method version (created by module_function)
|
|
136
|
-
def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
148
|
+
def self.convert: (String html, ?conversion_options options) -> String
|
|
137
149
|
end
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "3.1.0"
|
|
6
|
+
version = "3.2.1"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.85"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -18,11 +18,11 @@ clap = { version = "4.6", features = ["derive"] }
|
|
|
18
18
|
clap_complete = "4.6"
|
|
19
19
|
clap_mangen = "0.3"
|
|
20
20
|
encoding_rs = "0.8"
|
|
21
|
-
ext-php-rs = "0.15.
|
|
21
|
+
ext-php-rs = "0.15.10"
|
|
22
22
|
html5ever = "0.39.0"
|
|
23
23
|
once_cell = "1.21"
|
|
24
|
-
pyo3 = { version = "0.28.
|
|
25
|
-
rayon = "1.
|
|
24
|
+
pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
|
|
25
|
+
rayon = "1.12"
|
|
26
26
|
regex = "1.12"
|
|
27
27
|
serde = { version = "1.0", features = ["derive"] }
|
|
28
28
|
serde_json = "1.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rs"
|
|
3
|
-
version = "3.1.0"
|
|
3
|
+
version = "3.2.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -39,7 +39,7 @@ image = { version = "0.25", default-features = false, features = [
|
|
|
39
39
|
"bmp",
|
|
40
40
|
"webp",
|
|
41
41
|
], optional = true }
|
|
42
|
-
lru = "0.
|
|
42
|
+
lru = "0.17"
|
|
43
43
|
memchr = "2"
|
|
44
44
|
once_cell = "1.21"
|
|
45
45
|
regex = "1.12"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
3
|
+
|
|
4
|
+
fn main() {
|
|
5
|
+
let json = r#"{"headingStyle":"","listIndentType":"","listIndentWidth":2,"bullets":"-*+","strongEmSymbol":"*","escapeAsterisks":false,"escapeUnderscores":false,"escapeMisc":false,"escapeAscii":false,"codeLanguage":"","autolinks":true,"defaultTitle":false,"brInTables":false,"highlightStyle":"","extractMetadata":true,"whitespaceMode":"","stripNewlines":false,"wrap":false,"wrapWidth":80,"convertAsInline":false,"subSymbol":"","supSymbol":"","newlineStyle":"spaces","codeBlockStyle":"tildes","keepInlineImagesIn":null,"preprocessing":{"enabled":false,"preset":"","removeNavigation":false,"removeForms":false},"encoding":"utf-8","debug":false,"stripTags":null,"preserveTags":null,"skipImages":false,"linkStyle":"","outputFormat":"","includeDocumentStructure":false,"extractImages":false,"maxImageSize":5242880,"captureSvg":false,"inferDimensions":true}"#;
|
|
6
|
+
|
|
7
|
+
let opts: ConversionOptions = serde_json::from_str(json).unwrap();
|
|
8
|
+
println!("code_block_style: {:?}", opts.code_block_style);
|
|
9
|
+
|
|
10
|
+
let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
|
|
11
|
+
println!("result: {:?}", result.content);
|
|
12
|
+
}
|
|
@@ -29,7 +29,7 @@ pub(crate) use caption::handle_caption;
|
|
|
29
29
|
/// Dispatches table element handling to the main convert_table function.
|
|
30
30
|
///
|
|
31
31
|
/// # Usage in converter.rs
|
|
32
|
-
/// ```
|
|
32
|
+
/// ```text
|
|
33
33
|
/// if "table" == tag_name {
|
|
34
34
|
/// crate::converter::block::table::handle_table(
|
|
35
35
|
/// node_handle,
|
|
@@ -198,7 +198,11 @@ pub(crate) fn handle_li(
|
|
|
198
198
|
} else {
|
|
199
199
|
let bullets: Vec<char> = options.bullets.chars().collect();
|
|
200
200
|
let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
|
|
201
|
-
let bullet =
|
|
201
|
+
let bullet = if bullets.is_empty() {
|
|
202
|
+
'*'
|
|
203
|
+
} else {
|
|
204
|
+
bullets[bullet_index % bullets.len()]
|
|
205
|
+
};
|
|
202
206
|
output.push(bullet);
|
|
203
207
|
output.push(' ');
|
|
204
208
|
}
|
|
@@ -265,7 +269,11 @@ pub(crate) fn handle_li(
|
|
|
265
269
|
} else {
|
|
266
270
|
let bullets: Vec<char> = options.bullets.chars().collect();
|
|
267
271
|
let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
|
|
268
|
-
let bullet =
|
|
272
|
+
let bullet = if bullets.is_empty() {
|
|
273
|
+
'*'
|
|
274
|
+
} else {
|
|
275
|
+
bullets[bullet_index % bullets.len()]
|
|
276
|
+
};
|
|
269
277
|
let bullet_str = bullet.to_string();
|
|
270
278
|
let text_start = last_line.find(bullet).map_or(0, |pos| pos + 1);
|
|
271
279
|
(bullet_str, last_line[text_start..].trim().to_string())
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
//!
|
|
41
41
|
//! Each submodule (block, inline, list, etc.) follows a consistent pattern:
|
|
42
42
|
//!
|
|
43
|
-
//! ```
|
|
43
|
+
//! ```text
|
|
44
44
|
//! // Module declares handlers for specific element types
|
|
45
45
|
//! pub fn dispatch_<category>_handler(
|
|
46
46
|
//! tag_name: &str,
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
//! Once `converter.rs` is refactored to use `converter/main.rs`, the walk_node function
|
|
75
75
|
//! will use dispatch functions like:
|
|
76
76
|
//!
|
|
77
|
-
//! ```
|
|
77
|
+
//! ```text
|
|
78
78
|
//! use crate::converter::{block, inline, list, media, semantic, form};
|
|
79
79
|
//!
|
|
80
80
|
//! fn walk_node(...) {
|
|
@@ -166,7 +166,7 @@ pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
|
|
|
166
166
|
/// If `index` is already a char boundary it is returned unchanged.
|
|
167
167
|
/// Otherwise it walks backwards to find one. Returns 0 if no boundary
|
|
168
168
|
/// is found before `index`.
|
|
169
|
-
pub
|
|
169
|
+
pub fn floor_char_boundary(s: &str, index: usize) -> usize {
|
|
170
170
|
if index >= s.len() {
|
|
171
171
|
s.len()
|
|
172
172
|
} else {
|
|
@@ -18,7 +18,7 @@ pub use crate::metadata::{
|
|
|
18
18
|
};
|
|
19
19
|
|
|
20
20
|
pub use crate::options::{
|
|
21
|
-
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
-
ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
|
|
23
|
-
WhitespaceMode,
|
|
21
|
+
CodeBlockStyle, ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
+
LinkStyle, ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate,
|
|
23
|
+
PreprocessingPreset, WhitespaceMode,
|
|
24
24
|
};
|
|
@@ -26,7 +26,7 @@ pub const DEFAULT_INLINE_IMAGE_LIMIT: u64 = 5 * 1024 * 1024;
|
|
|
26
26
|
/// corresponding fields unchanged when applied via [`InlineImageConfig::apply_update`].
|
|
27
27
|
#[derive(Debug, Clone, Default)]
|
|
28
28
|
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
|
|
29
|
-
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(
|
|
29
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
30
30
|
pub struct InlineImageConfigUpdate {
|
|
31
31
|
/// Optional maximum decoded size override in bytes.
|
|
32
32
|
pub max_decoded_size_bytes: Option<u64>,
|
|
@@ -77,9 +77,8 @@ mod validation;
|
|
|
77
77
|
pub use exports::*;
|
|
78
78
|
pub use types::{
|
|
79
79
|
AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
|
|
80
|
-
TableGrid, TextAnnotation, WarningKind,
|
|
80
|
+
TableData, TableGrid, TextAnnotation, WarningKind,
|
|
81
81
|
};
|
|
82
|
-
// Note: types::TableData will replace convert_api::TableData when convert() is refactored
|
|
83
82
|
|
|
84
83
|
// ============================================================================
|
|
85
84
|
// Main Public API Functions
|
|
@@ -133,7 +133,7 @@ pub struct MetadataConfig {
|
|
|
133
133
|
/// ```
|
|
134
134
|
#[derive(Debug, Clone, Default)]
|
|
135
135
|
#[cfg_attr(any(feature = "serde", feature = "metadata"), derive(serde::Deserialize))]
|
|
136
|
-
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(
|
|
136
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
137
137
|
pub struct MetadataConfigUpdate {
|
|
138
138
|
/// Optional override for extracting document-level metadata.
|
|
139
139
|
///
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
//!
|
|
47
47
|
//! ## Basic Usage with `convert()`
|
|
48
48
|
//!
|
|
49
|
-
//! ```
|
|
49
|
+
//! ```text
|
|
50
50
|
//! use html_to_markdown_rs::convert;
|
|
51
51
|
//!
|
|
52
52
|
//! let html = r#"
|
|
@@ -87,7 +87,7 @@
|
|
|
87
87
|
//!
|
|
88
88
|
//! ## Selective Extraction
|
|
89
89
|
//!
|
|
90
|
-
//! ```
|
|
90
|
+
//! ```text
|
|
91
91
|
//! use html_to_markdown_rs::{convert, ConversionOptions};
|
|
92
92
|
//!
|
|
93
93
|
//! let options = ConversionOptions {
|
|
@@ -102,7 +102,7 @@
|
|
|
102
102
|
//!
|
|
103
103
|
//! ## Analyzing Link Types
|
|
104
104
|
//!
|
|
105
|
-
//! ```
|
|
105
|
+
//! ```text
|
|
106
106
|
//! use html_to_markdown_rs::convert;
|
|
107
107
|
//! use html_to_markdown_rs::metadata::LinkType;
|
|
108
108
|
//!
|
|
@@ -126,7 +126,7 @@
|
|
|
126
126
|
//! All types in this module support serialization via `serde` when the `metadata` feature is enabled.
|
|
127
127
|
//! This enables easy export to JSON, YAML, or other formats:
|
|
128
128
|
//!
|
|
129
|
-
//! ```
|
|
129
|
+
//! ```text
|
|
130
130
|
//! use html_to_markdown_rs::convert;
|
|
131
131
|
//!
|
|
132
132
|
//! let result = convert(html, None)?;
|
|
@@ -160,7 +160,7 @@ use std::rc::Rc;
|
|
|
160
160
|
///
|
|
161
161
|
/// # Examples
|
|
162
162
|
///
|
|
163
|
-
/// ```
|
|
163
|
+
/// ```text
|
|
164
164
|
/// let collector = MetadataCollector::new(MetadataConfig::default());
|
|
165
165
|
/// let handle = Rc::new(RefCell::new(collector));
|
|
166
166
|
///
|
|
@@ -13,7 +13,7 @@ use crate::options::validation::{
|
|
|
13
13
|
///
|
|
14
14
|
/// # Example
|
|
15
15
|
///
|
|
16
|
-
/// ```
|
|
16
|
+
/// ```text
|
|
17
17
|
/// use html_to_markdown_rs::ConversionOptions;
|
|
18
18
|
///
|
|
19
19
|
/// let options = ConversionOptions::builder()
|
|
@@ -27,10 +27,7 @@ use crate::options::validation::{
|
|
|
27
27
|
any(feature = "serde", feature = "metadata"),
|
|
28
28
|
derive(serde::Serialize, serde::Deserialize)
|
|
29
29
|
)]
|
|
30
|
-
#[cfg_attr(
|
|
31
|
-
any(feature = "serde", feature = "metadata"),
|
|
32
|
-
serde(rename_all = "camelCase", default, deny_unknown_fields)
|
|
33
|
-
)]
|
|
30
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
|
|
34
31
|
pub struct ConversionOptions {
|
|
35
32
|
/// Heading style to use in Markdown output (ATX `#` or Setext underline).
|
|
36
33
|
pub heading_style: HeadingStyle,
|
|
@@ -116,7 +113,7 @@ impl Default for ConversionOptions {
|
|
|
116
113
|
heading_style: HeadingStyle::default(),
|
|
117
114
|
list_indent_type: ListIndentType::default(),
|
|
118
115
|
list_indent_width: 2,
|
|
119
|
-
bullets: "
|
|
116
|
+
bullets: "-*+".to_string(),
|
|
120
117
|
strong_em_symbol: '*',
|
|
121
118
|
escape_asterisks: false,
|
|
122
119
|
escape_underscores: false,
|
|
@@ -293,10 +290,7 @@ use crate::options::preprocessing::PreprocessingOptionsUpdate;
|
|
|
293
290
|
any(feature = "serde", feature = "metadata"),
|
|
294
291
|
derive(serde::Serialize, serde::Deserialize)
|
|
295
292
|
)]
|
|
296
|
-
#[cfg_attr(
|
|
297
|
-
any(feature = "serde", feature = "metadata"),
|
|
298
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
299
|
-
)]
|
|
293
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
300
294
|
pub struct ConversionOptionsUpdate {
|
|
301
295
|
/// Optional override for [`ConversionOptions::heading_style`].
|
|
302
296
|
pub heading_style: Option<HeadingStyle>,
|
|
@@ -472,8 +466,8 @@ mod tests {
|
|
|
472
466
|
#[test]
|
|
473
467
|
fn test_conversion_options_partial_deserialization() {
|
|
474
468
|
let partial_json = r#"{
|
|
475
|
-
"
|
|
476
|
-
"
|
|
469
|
+
"heading_style": "atxclosed",
|
|
470
|
+
"list_indent_width": 4,
|
|
477
471
|
"bullets": "*"
|
|
478
472
|
}"#;
|
|
479
473
|
|
|
@@ -10,7 +10,7 @@ pub mod preprocessing;
|
|
|
10
10
|
pub mod validation;
|
|
11
11
|
|
|
12
12
|
// Re-exports for easy access
|
|
13
|
-
pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
|
|
13
|
+
pub use conversion::{ConversionOptions, ConversionOptionsBuilder, ConversionOptionsUpdate};
|
|
14
14
|
pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
|
|
15
15
|
pub use validation::{
|
|
16
16
|
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
@@ -42,10 +42,7 @@ impl PreprocessingPreset {
|
|
|
42
42
|
any(feature = "serde", feature = "metadata"),
|
|
43
43
|
derive(serde::Serialize, serde::Deserialize)
|
|
44
44
|
)]
|
|
45
|
-
#[cfg_attr(
|
|
46
|
-
any(feature = "serde", feature = "metadata"),
|
|
47
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
48
|
-
)]
|
|
45
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default, deny_unknown_fields))]
|
|
49
46
|
pub struct PreprocessingOptions {
|
|
50
47
|
/// Enable HTML preprocessing globally
|
|
51
48
|
pub enabled: bool,
|
|
@@ -70,10 +67,7 @@ pub struct PreprocessingOptions {
|
|
|
70
67
|
any(feature = "serde", feature = "metadata"),
|
|
71
68
|
derive(serde::Serialize, serde::Deserialize)
|
|
72
69
|
)]
|
|
73
|
-
#[cfg_attr(
|
|
74
|
-
any(feature = "serde", feature = "metadata"),
|
|
75
|
-
serde(rename_all = "camelCase", deny_unknown_fields)
|
|
76
|
-
)]
|
|
70
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(deny_unknown_fields))]
|
|
77
71
|
pub struct PreprocessingOptionsUpdate {
|
|
78
72
|
/// Optional global preprocessing enablement override
|
|
79
73
|
pub enabled: Option<bool>,
|
|
@@ -91,7 +85,7 @@ pub struct PreprocessingOptionsUpdate {
|
|
|
91
85
|
impl Default for PreprocessingOptions {
|
|
92
86
|
fn default() -> Self {
|
|
93
87
|
Self {
|
|
94
|
-
enabled:
|
|
88
|
+
enabled: true,
|
|
95
89
|
preset: PreprocessingPreset::default(),
|
|
96
90
|
remove_navigation: true,
|
|
97
91
|
remove_forms: true,
|
|
@@ -115,10 +115,10 @@ impl NewlineStyle {
|
|
|
115
115
|
/// Determines how code blocks (`<pre><code>`) are rendered in Markdown.
|
|
116
116
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
|
117
117
|
pub enum CodeBlockStyle {
|
|
118
|
-
/// Indented code blocks (4 spaces).
|
|
119
|
-
#[default]
|
|
118
|
+
/// Indented code blocks (4 spaces). `CommonMark` standard.
|
|
120
119
|
Indented,
|
|
121
|
-
/// Fenced code blocks with backticks (```). Supports language hints.
|
|
120
|
+
/// Fenced code blocks with backticks (```). Default (GFM). Supports language hints.
|
|
121
|
+
#[default]
|
|
122
122
|
Backticks,
|
|
123
123
|
/// Fenced code blocks with tildes (~~~). Supports language hints.
|
|
124
124
|
Tildes,
|
|
@@ -147,8 +147,10 @@ pub struct TextAnnotation {
|
|
|
147
147
|
/// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
148
148
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
149
149
|
#[serde(tag = "annotation_type", rename_all = "snake_case")]
|
|
150
|
+
#[derive(Default)]
|
|
150
151
|
pub enum AnnotationKind {
|
|
151
152
|
/// Bold / strong emphasis.
|
|
153
|
+
#[default]
|
|
152
154
|
Bold,
|
|
153
155
|
/// Italic / emphasis.
|
|
154
156
|
Italic,
|
|
@@ -173,3 +175,12 @@ pub enum AnnotationKind {
|
|
|
173
175
|
title: Option<String>,
|
|
174
176
|
},
|
|
175
177
|
}
|
|
178
|
+
|
|
179
|
+
impl Default for NodeContent {
|
|
180
|
+
fn default() -> Self {
|
|
181
|
+
Self::Heading {
|
|
182
|
+
level: 1,
|
|
183
|
+
text: String::new(),
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
//! The primary result type for HTML conversion and extraction.
|
|
2
2
|
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
|
|
3
5
|
use super::document::DocumentStructure;
|
|
4
6
|
use super::tables::TableData;
|
|
5
7
|
use super::warnings::ProcessingWarning;
|
|
@@ -11,14 +13,14 @@ use super::warnings::ProcessingWarning;
|
|
|
11
13
|
///
|
|
12
14
|
/// # Example
|
|
13
15
|
///
|
|
14
|
-
/// ```
|
|
16
|
+
/// ```text
|
|
15
17
|
/// use html_to_markdown_rs::{convert, ConversionOptions};
|
|
16
18
|
///
|
|
17
19
|
/// let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
18
20
|
/// assert!(result.content.is_some());
|
|
19
21
|
/// assert!(result.warnings.is_empty());
|
|
20
22
|
/// ```
|
|
21
|
-
#[derive(Debug, Clone, Default)]
|
|
23
|
+
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
22
24
|
pub struct ConversionResult {
|
|
23
25
|
/// Converted text output (markdown, djot, or plain text).
|
|
24
26
|
///
|
|
@@ -42,6 +44,7 @@ pub struct ConversionResult {
|
|
|
42
44
|
///
|
|
43
45
|
/// Populated when `extract_images` is `true` in options.
|
|
44
46
|
#[cfg(feature = "inline-images")]
|
|
47
|
+
#[serde(skip)]
|
|
45
48
|
pub images: Vec<crate::inline_images::InlineImage>,
|
|
46
49
|
|
|
47
50
|
/// Non-fatal processing warnings.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
use serde::{Deserialize, Serialize};
|
|
4
4
|
|
|
5
5
|
/// A structured table grid with cell-level data including spans.
|
|
6
|
-
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
6
|
+
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
7
7
|
pub struct TableGrid {
|
|
8
8
|
/// Number of rows.
|
|
9
9
|
pub rows: u32,
|