html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.29.0"
|
|
3
|
+
version = "3.0.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -26,8 +26,6 @@ html-to-markdown-rs = { path = "../../../vendor/html-to-markdown-rs", features =
|
|
|
26
26
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
27
27
|
"rb-sys",
|
|
28
28
|
] }
|
|
29
|
-
pprof = { version = "0.15", features = ["flamegraph"], optional = true }
|
|
30
|
-
|
|
31
29
|
[dev-dependencies]
|
|
32
30
|
pretty_assertions = "1.4"
|
|
33
31
|
|
|
@@ -36,7 +34,6 @@ default = ["inline-images", "metadata", "visitor"]
|
|
|
36
34
|
inline-images = ["html-to-markdown-rs/inline-images"]
|
|
37
35
|
metadata = ["html-to-markdown-rs/metadata"]
|
|
38
36
|
visitor = ["html-to-markdown-rs/visitor"]
|
|
39
|
-
profiling = ["dep:pprof"]
|
|
40
37
|
|
|
41
38
|
[lints.rust]
|
|
42
39
|
unsafe_code = "forbid"
|
|
@@ -144,25 +144,16 @@ markdown = HtmlToMarkdown.convert(
|
|
|
144
144
|
|
|
145
145
|
### Inline Images
|
|
146
146
|
|
|
147
|
-
|
|
147
|
+
Convert HTML with inline images (data URIs, SVG) to Markdown.
|
|
148
148
|
|
|
149
149
|
```ruby
|
|
150
150
|
require 'html_to_markdown'
|
|
151
151
|
|
|
152
|
-
|
|
153
|
-
'<img src="data:image/png;base64,iVBORw0..." alt="Pixel">'
|
|
154
|
-
image_config: {
|
|
155
|
-
max_decoded_size_bytes: 1 * 1024 * 1024,
|
|
156
|
-
infer_dimensions: true,
|
|
157
|
-
filename_prefix: 'img_',
|
|
158
|
-
capture_svg: true
|
|
159
|
-
}
|
|
152
|
+
markdown = HtmlToMarkdown.convert(
|
|
153
|
+
'<img src="data:image/png;base64,iVBORw0..." alt="Pixel">'
|
|
160
154
|
)
|
|
161
155
|
|
|
162
|
-
puts
|
|
163
|
-
result.inline_images.each do |img|
|
|
164
|
-
puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
|
|
165
|
-
end
|
|
156
|
+
puts markdown
|
|
166
157
|
```
|
|
167
158
|
|
|
168
159
|
## CLI
|
|
@@ -1,53 +1,8 @@
|
|
|
1
1
|
//! Inline image configuration and conversion functions.
|
|
2
2
|
|
|
3
|
-
use crate::types::{arg_error, symbol_to_string};
|
|
4
|
-
use html_to_markdown_rs::{
|
|
5
|
-
DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
|
|
6
|
-
InlineImageWarning,
|
|
7
|
-
};
|
|
3
|
+
use html_to_markdown_rs::InlineImage;
|
|
8
4
|
use magnus::prelude::*;
|
|
9
|
-
use magnus::r_hash::ForEach;
|
|
10
|
-
use magnus::{Error, RHash, Ruby, TryConvert, Value};
|
|
11
|
-
|
|
12
|
-
pub fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
|
|
13
|
-
let mut update = InlineImageConfigUpdate::default();
|
|
14
|
-
|
|
15
|
-
let Some(config) = config else {
|
|
16
|
-
return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
if config.is_nil() {
|
|
20
|
-
return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
|
|
24
|
-
|
|
25
|
-
hash.foreach(|key: Value, val: Value| {
|
|
26
|
-
let key_name = symbol_to_string(key)?;
|
|
27
|
-
match key_name.as_str() {
|
|
28
|
-
"max_decoded_size_bytes" => {
|
|
29
|
-
update.max_decoded_size_bytes = Some(u64::try_convert(val)?);
|
|
30
|
-
}
|
|
31
|
-
"filename_prefix" => {
|
|
32
|
-
update.filename_prefix = if val.is_nil() {
|
|
33
|
-
None
|
|
34
|
-
} else {
|
|
35
|
-
Some(String::try_convert(val)?)
|
|
36
|
-
};
|
|
37
|
-
}
|
|
38
|
-
"capture_svg" => {
|
|
39
|
-
update.capture_svg = Some(bool::try_convert(val)?);
|
|
40
|
-
}
|
|
41
|
-
"infer_dimensions" => {
|
|
42
|
-
update.infer_dimensions = Some(bool::try_convert(val)?);
|
|
43
|
-
}
|
|
44
|
-
_ => {}
|
|
45
|
-
}
|
|
46
|
-
Ok(ForEach::Continue)
|
|
47
|
-
})?;
|
|
48
|
-
|
|
49
|
-
Ok(InlineImageConfig::from_update(update))
|
|
50
|
-
}
|
|
5
|
+
use magnus::{Error, Ruby, Value};
|
|
51
6
|
|
|
52
7
|
pub fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
|
|
53
8
|
let InlineImage {
|
|
@@ -97,29 +52,3 @@ pub fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, E
|
|
|
97
52
|
|
|
98
53
|
Ok(hash.as_value())
|
|
99
54
|
}
|
|
100
|
-
|
|
101
|
-
pub fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
|
|
102
|
-
let hash = ruby.hash_new();
|
|
103
|
-
hash.aset(ruby.intern("index"), warning.index as i64)?;
|
|
104
|
-
hash.aset(ruby.intern("message"), warning.message)?;
|
|
105
|
-
Ok(hash.as_value())
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
pub fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
|
|
109
|
-
let hash = ruby.hash_new();
|
|
110
|
-
hash.aset(ruby.intern("markdown"), extraction.markdown)?;
|
|
111
|
-
|
|
112
|
-
let inline_images = ruby.ary_new();
|
|
113
|
-
for image in extraction.inline_images {
|
|
114
|
-
inline_images.push(inline_image_to_value(ruby, image)?)?;
|
|
115
|
-
}
|
|
116
|
-
hash.aset(ruby.intern("inline_images"), inline_images)?;
|
|
117
|
-
|
|
118
|
-
let warnings = ruby.ary_new();
|
|
119
|
-
for warning in extraction.warnings {
|
|
120
|
-
warnings.push(warning_to_value(ruby, warning)?)?;
|
|
121
|
-
}
|
|
122
|
-
hash.aset(ruby.intern("warnings"), warnings)?;
|
|
123
|
-
|
|
124
|
-
Ok(hash.as_value())
|
|
125
|
-
}
|
|
@@ -1,56 +1,12 @@
|
|
|
1
1
|
//! Metadata configuration and conversion functions.
|
|
2
2
|
|
|
3
|
-
use crate::types::{arg_error, symbol_to_string};
|
|
4
3
|
use html_to_markdown_rs::metadata::{
|
|
5
|
-
DocumentMetadata as RustDocumentMetadata,
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
DocumentMetadata as RustDocumentMetadata, HeaderMetadata as RustHeaderMetadata,
|
|
5
|
+
HtmlMetadata as RustHtmlMetadata, ImageMetadata as RustImageMetadata, LinkMetadata as RustLinkMetadata,
|
|
6
|
+
StructuredData as RustStructuredData, TextDirection as RustTextDirection,
|
|
8
7
|
};
|
|
9
8
|
use magnus::prelude::*;
|
|
10
|
-
use magnus::r_hash::ForEach;
|
|
11
|
-
use magnus::{Error, RHash, Ruby, TryConvert, Value};
|
|
12
|
-
|
|
13
|
-
pub fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMetadataConfig, Error> {
|
|
14
|
-
let mut cfg = RustMetadataConfig::default();
|
|
15
|
-
|
|
16
|
-
let Some(config) = config else {
|
|
17
|
-
return Ok(cfg);
|
|
18
|
-
};
|
|
19
|
-
|
|
20
|
-
if config.is_nil() {
|
|
21
|
-
return Ok(cfg);
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
let hash = RHash::from_value(config).ok_or_else(|| arg_error("metadata_config must be provided as a Hash"))?;
|
|
25
|
-
|
|
26
|
-
hash.foreach(|key: Value, val: Value| {
|
|
27
|
-
let key_name = symbol_to_string(key)?;
|
|
28
|
-
match key_name.as_str() {
|
|
29
|
-
"extract_document" => {
|
|
30
|
-
cfg.extract_document = bool::try_convert(val)?;
|
|
31
|
-
}
|
|
32
|
-
"extract_headers" => {
|
|
33
|
-
cfg.extract_headers = bool::try_convert(val)?;
|
|
34
|
-
}
|
|
35
|
-
"extract_links" => {
|
|
36
|
-
cfg.extract_links = bool::try_convert(val)?;
|
|
37
|
-
}
|
|
38
|
-
"extract_images" => {
|
|
39
|
-
cfg.extract_images = bool::try_convert(val)?;
|
|
40
|
-
}
|
|
41
|
-
"extract_structured_data" => {
|
|
42
|
-
cfg.extract_structured_data = bool::try_convert(val)?;
|
|
43
|
-
}
|
|
44
|
-
"max_structured_data_size" => {
|
|
45
|
-
cfg.max_structured_data_size = usize::try_convert(val)?;
|
|
46
|
-
}
|
|
47
|
-
_ => {}
|
|
48
|
-
}
|
|
49
|
-
Ok(ForEach::Continue)
|
|
50
|
-
})?;
|
|
51
|
-
|
|
52
|
-
Ok(cfg)
|
|
53
|
-
}
|
|
9
|
+
use magnus::{Error, Ruby, Value};
|
|
54
10
|
|
|
55
11
|
fn opt_string_to_ruby(ruby: &Ruby, opt: Option<String>) -> Result<Value, Error> {
|
|
56
12
|
match opt {
|
|
@@ -183,7 +139,7 @@ fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result
|
|
|
183
139
|
Ok(array.as_value())
|
|
184
140
|
}
|
|
185
141
|
|
|
186
|
-
pub fn extended_metadata_to_ruby(ruby: &Ruby, metadata:
|
|
142
|
+
pub fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustHtmlMetadata) -> Result<Value, Error> {
|
|
187
143
|
let hash = ruby.hash_new();
|
|
188
144
|
|
|
189
145
|
hash.aset(
|
|
@@ -5,13 +5,7 @@ pub mod inline_images;
|
|
|
5
5
|
#[cfg(feature = "metadata")]
|
|
6
6
|
pub mod metadata;
|
|
7
7
|
|
|
8
|
-
#[cfg(feature = "visitor")]
|
|
9
|
-
pub mod tables;
|
|
10
|
-
|
|
11
8
|
pub use inline_images::*;
|
|
12
9
|
|
|
13
10
|
#[cfg(feature = "metadata")]
|
|
14
11
|
pub use metadata::*;
|
|
15
|
-
|
|
16
|
-
#[cfg(feature = "visitor")]
|
|
17
|
-
pub use tables::*;
|
|
@@ -1,52 +1,19 @@
|
|
|
1
1
|
#![allow(clippy::all, clippy::pedantic, clippy::nursery, missing_docs)]
|
|
2
2
|
|
|
3
|
-
use html_to_markdown_rs::{
|
|
4
|
-
ConversionOptions, convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner,
|
|
5
|
-
error::ConversionError, safety::guard_panic,
|
|
6
|
-
};
|
|
7
|
-
|
|
8
|
-
#[cfg(feature = "visitor")]
|
|
9
|
-
use html_to_markdown_rs::convert_with_visitor as convert_with_visitor_inner;
|
|
10
|
-
|
|
11
|
-
#[cfg(feature = "visitor")]
|
|
12
|
-
use html_to_markdown_rs::convert_with_tables as convert_with_tables_inner;
|
|
13
|
-
|
|
14
|
-
#[cfg(feature = "metadata")]
|
|
15
|
-
use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
|
|
3
|
+
use html_to_markdown_rs::{error::ConversionError, safety::guard_panic};
|
|
16
4
|
|
|
17
5
|
mod conversion;
|
|
18
6
|
mod options;
|
|
19
|
-
mod profiling;
|
|
20
7
|
mod types;
|
|
21
8
|
|
|
22
|
-
#[cfg(feature = "visitor")]
|
|
23
|
-
mod visitor;
|
|
24
|
-
|
|
25
|
-
use conversion::{build_inline_image_config, extraction_to_value};
|
|
26
9
|
use options::build_conversion_options;
|
|
27
10
|
use types::{arg_error, runtime_error};
|
|
28
11
|
|
|
29
12
|
#[cfg(feature = "metadata")]
|
|
30
|
-
use conversion::
|
|
31
|
-
|
|
32
|
-
#[cfg(feature = "visitor")]
|
|
33
|
-
use conversion::tables_result_to_ruby;
|
|
34
|
-
|
|
35
|
-
#[cfg(feature = "visitor")]
|
|
36
|
-
use visitor::RubyVisitorWrapper;
|
|
13
|
+
use conversion::extended_metadata_to_ruby;
|
|
37
14
|
|
|
38
15
|
use magnus::prelude::*;
|
|
39
|
-
use magnus::{Error, Ruby,
|
|
40
|
-
|
|
41
|
-
#[cfg(feature = "visitor")]
|
|
42
|
-
use std::panic::AssertUnwindSafe;
|
|
43
|
-
|
|
44
|
-
#[cfg(feature = "profiling")]
|
|
45
|
-
use std::path::PathBuf;
|
|
46
|
-
|
|
47
|
-
#[derive(Clone)]
|
|
48
|
-
#[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
|
|
49
|
-
struct OptionsHandle(ConversionOptions);
|
|
16
|
+
use magnus::{Error, Ruby, Value, function, scan_args::scan_args};
|
|
50
17
|
|
|
51
18
|
fn conversion_error(err: ConversionError) -> Error {
|
|
52
19
|
match err {
|
|
@@ -58,208 +25,104 @@ fn conversion_error(err: ConversionError) -> Error {
|
|
|
58
25
|
}
|
|
59
26
|
}
|
|
60
27
|
|
|
61
|
-
fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
28
|
+
fn convert_full_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
62
29
|
let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
|
|
63
30
|
let html = parsed.required.0;
|
|
64
31
|
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
65
32
|
|
|
66
|
-
guard_panic(||
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
|
|
70
|
-
let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
|
|
71
|
-
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
72
|
-
Ok(OptionsHandle(options))
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
76
|
-
let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
|
|
77
|
-
let html = parsed.required.0;
|
|
78
|
-
let handle = parsed.required.1;
|
|
79
|
-
let options = handle.0.clone();
|
|
80
|
-
|
|
81
|
-
guard_panic(|| profiling::maybe_profile(|| convert_inner(&html, Some(options)))).map_err(conversion_error)
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
#[cfg(feature = "inline-images")]
|
|
85
|
-
fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
86
|
-
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
87
|
-
let html = parsed.required.0;
|
|
88
|
-
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
89
|
-
let config = build_inline_image_config(ruby, parsed.optional.1)?;
|
|
90
|
-
|
|
91
|
-
let extraction = guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config, None))
|
|
92
|
-
.map_err(conversion_error)?;
|
|
93
|
-
|
|
94
|
-
extraction_to_value(ruby, extraction)
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
#[cfg(feature = "inline-images")]
|
|
98
|
-
fn convert_with_inline_images_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
99
|
-
let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
|
|
100
|
-
let html = parsed.required.0;
|
|
101
|
-
let handle = parsed.required.1;
|
|
102
|
-
let options = handle.0.clone();
|
|
103
|
-
let config = build_inline_image_config(ruby, parsed.optional.0)?;
|
|
104
|
-
|
|
105
|
-
let extraction = guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config, None))
|
|
106
|
-
.map_err(conversion_error)?;
|
|
107
|
-
|
|
108
|
-
extraction_to_value(ruby, extraction)
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
#[cfg(feature = "metadata")]
|
|
112
|
-
fn convert_with_metadata_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
113
|
-
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
114
|
-
let html = parsed.required.0;
|
|
115
|
-
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
116
|
-
let metadata_config = build_metadata_config(ruby, parsed.optional.1)?;
|
|
117
|
-
let _visitor = parsed.optional.2;
|
|
118
|
-
|
|
119
|
-
let (markdown, metadata) = guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config, None))
|
|
33
|
+
let result = guard_panic(|| html_to_markdown_rs::convert(&html, Some(options.clone())))
|
|
120
34
|
.map_err(conversion_error)?;
|
|
121
35
|
|
|
122
|
-
let
|
|
123
|
-
array.push(markdown)?;
|
|
124
|
-
array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
125
|
-
|
|
126
|
-
Ok(array.as_value())
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
#[cfg(feature = "metadata")]
|
|
130
|
-
fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
131
|
-
let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
|
|
132
|
-
let html = parsed.required.0;
|
|
133
|
-
let handle = parsed.required.1;
|
|
134
|
-
let options = handle.0.clone();
|
|
135
|
-
let metadata_config = build_metadata_config(ruby, parsed.optional.0)?;
|
|
36
|
+
let hash = ruby.hash_new();
|
|
136
37
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
143
|
-
|
|
144
|
-
Ok(array.as_value())
|
|
145
|
-
}
|
|
38
|
+
// content: Option<String>
|
|
39
|
+
match result.content {
|
|
40
|
+
Some(ref s) => hash.aset(ruby.intern("content"), s.as_str())?,
|
|
41
|
+
None => hash.aset(ruby.intern("content"), ruby.qnil())?,
|
|
42
|
+
}
|
|
146
43
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
150
|
-
let html = parsed.required.0;
|
|
151
|
-
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
44
|
+
// document: not yet exposed
|
|
45
|
+
hash.aset(ruby.intern("document"), ruby.qnil())?;
|
|
152
46
|
|
|
47
|
+
// metadata
|
|
153
48
|
#[cfg(feature = "metadata")]
|
|
154
|
-
|
|
49
|
+
{
|
|
50
|
+
let metadata_value = extended_metadata_to_ruby(ruby, result.metadata)?;
|
|
51
|
+
hash.aset(ruby.intern("metadata"), metadata_value)?;
|
|
52
|
+
}
|
|
155
53
|
#[cfg(not(feature = "metadata"))]
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
let result =
|
|
159
|
-
guard_panic(|| convert_with_tables_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
|
|
54
|
+
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
160
55
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if val.is_nil() {
|
|
180
|
-
return guard_panic(AssertUnwindSafe(|| {
|
|
181
|
-
profiling::maybe_profile(|| convert_inner(&html, Some(options)))
|
|
182
|
-
}))
|
|
183
|
-
.map_err(conversion_error);
|
|
56
|
+
// tables: Vec<TableData> with grid and markdown
|
|
57
|
+
{
|
|
58
|
+
let tables_array = ruby.ary_new();
|
|
59
|
+
for table in &result.tables {
|
|
60
|
+
let table_hash = ruby.hash_new();
|
|
61
|
+
let grid_hash = ruby.hash_new();
|
|
62
|
+
grid_hash.aset(ruby.intern("rows"), table.grid.rows as i64)?;
|
|
63
|
+
grid_hash.aset(ruby.intern("cols"), table.grid.cols as i64)?;
|
|
64
|
+
let cells_array = ruby.ary_new();
|
|
65
|
+
for cell in &table.grid.cells {
|
|
66
|
+
let cell_hash = ruby.hash_new();
|
|
67
|
+
cell_hash.aset(ruby.intern("content"), cell.content.as_str())?;
|
|
68
|
+
cell_hash.aset(ruby.intern("row"), cell.row as i64)?;
|
|
69
|
+
cell_hash.aset(ruby.intern("col"), cell.col as i64)?;
|
|
70
|
+
cell_hash.aset(ruby.intern("row_span"), cell.row_span as i64)?;
|
|
71
|
+
cell_hash.aset(ruby.intern("col_span"), cell.col_span as i64)?;
|
|
72
|
+
cell_hash.aset(ruby.intern("is_header"), cell.is_header)?;
|
|
73
|
+
cells_array.push(cell_hash)?;
|
|
184
74
|
}
|
|
185
|
-
|
|
75
|
+
grid_hash.aset(ruby.intern("cells"), cells_array)?;
|
|
76
|
+
table_hash.aset(ruby.intern("grid"), grid_hash)?;
|
|
77
|
+
table_hash.aset(ruby.intern("markdown"), table.markdown.as_str())?;
|
|
78
|
+
tables_array.push(table_hash)?;
|
|
186
79
|
}
|
|
187
|
-
|
|
188
|
-
};
|
|
189
|
-
|
|
190
|
-
let visitor_wrapper = RubyVisitorWrapper::new(visitor_value);
|
|
191
|
-
let visitor_handle = std::rc::Rc::new(std::cell::RefCell::new(visitor_wrapper.clone()));
|
|
192
|
-
|
|
193
|
-
let result = guard_panic(AssertUnwindSafe(|| {
|
|
194
|
-
profiling::maybe_profile(|| convert_with_visitor_inner(&html, Some(options), Some(visitor_handle)))
|
|
195
|
-
}))
|
|
196
|
-
.map_err(conversion_error)?;
|
|
197
|
-
|
|
198
|
-
if let Some(error_msg) = visitor_wrapper.last_error.borrow().as_ref() {
|
|
199
|
-
return Err(runtime_error(error_msg.clone()));
|
|
80
|
+
hash.aset(ruby.intern("tables"), tables_array)?;
|
|
200
81
|
}
|
|
201
82
|
|
|
202
|
-
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
#[cfg(feature = "profiling")]
|
|
206
|
-
fn start_profiling_fn(_ruby: &Ruby, args: &[Value]) -> Result<bool, Error> {
|
|
207
|
-
let output = args.first().ok_or_else(|| arg_error("output_path required"))?;
|
|
208
|
-
let output: String = String::try_convert(*output)?;
|
|
209
|
-
let freq = if let Some(value) = args.get(1) {
|
|
210
|
-
i32::try_convert(*value)?
|
|
211
|
-
} else {
|
|
212
|
-
1000
|
|
213
|
-
};
|
|
214
|
-
profiling::start(PathBuf::from(output), freq).map_err(conversion_error)?;
|
|
215
|
-
Ok(true)
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
#[cfg(feature = "profiling")]
|
|
219
|
-
fn stop_profiling_fn(_ruby: &Ruby, _args: &[Value]) -> Result<bool, Error> {
|
|
220
|
-
profiling::stop().map_err(conversion_error)?;
|
|
221
|
-
Ok(true)
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
#[magnus::init]
|
|
225
|
-
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
226
|
-
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
227
|
-
module.define_singleton_method("convert", function!(convert_fn, -1))?;
|
|
228
|
-
module.define_singleton_method("options", function!(options_handle_fn, -1))?;
|
|
229
|
-
module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
|
|
230
|
-
|
|
83
|
+
// images
|
|
231
84
|
#[cfg(feature = "inline-images")]
|
|
232
85
|
{
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
)?;
|
|
86
|
+
use conversion::inline_image_to_value;
|
|
87
|
+
let images_array = ruby.ary_new();
|
|
88
|
+
for image in result.images {
|
|
89
|
+
let image_value = inline_image_to_value(ruby, image)?;
|
|
90
|
+
images_array.push(image_value)?;
|
|
91
|
+
}
|
|
92
|
+
hash.aset(ruby.intern("images"), images_array)?;
|
|
241
93
|
}
|
|
242
|
-
|
|
243
|
-
#[cfg(feature = "metadata")]
|
|
94
|
+
#[cfg(not(feature = "inline-images"))]
|
|
244
95
|
{
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
"convert_with_metadata_handle",
|
|
248
|
-
function!(convert_with_metadata_handle_fn, -1),
|
|
249
|
-
)?;
|
|
96
|
+
let empty = ruby.ary_new();
|
|
97
|
+
hash.aset(ruby.intern("images"), empty)?;
|
|
250
98
|
}
|
|
251
99
|
|
|
252
|
-
|
|
100
|
+
// warnings
|
|
253
101
|
{
|
|
254
|
-
|
|
255
|
-
|
|
102
|
+
let warnings_array = ruby.ary_new();
|
|
103
|
+
for warning in &result.warnings {
|
|
104
|
+
let w_hash = ruby.hash_new();
|
|
105
|
+
w_hash.aset(ruby.intern("message"), warning.message.as_str())?;
|
|
106
|
+
let kind_str = match warning.kind {
|
|
107
|
+
html_to_markdown_rs::WarningKind::ImageExtractionFailed => "image_extraction_failed",
|
|
108
|
+
html_to_markdown_rs::WarningKind::EncodingFallback => "encoding_fallback",
|
|
109
|
+
html_to_markdown_rs::WarningKind::TruncatedInput => "truncated_input",
|
|
110
|
+
html_to_markdown_rs::WarningKind::MalformedHtml => "malformed_html",
|
|
111
|
+
html_to_markdown_rs::WarningKind::SanitizationApplied => "sanitization_applied",
|
|
112
|
+
};
|
|
113
|
+
w_hash.aset(ruby.intern("kind"), kind_str)?;
|
|
114
|
+
warnings_array.push(w_hash)?;
|
|
115
|
+
}
|
|
116
|
+
hash.aset(ruby.intern("warnings"), warnings_array)?;
|
|
256
117
|
}
|
|
257
118
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
119
|
+
Ok(hash.as_value())
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[magnus::init]
|
|
123
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
124
|
+
let module = ruby.define_module("HtmlToMarkdown")?;
|
|
125
|
+
module.define_singleton_method("convert", function!(convert_full_fn, -1))?;
|
|
263
126
|
|
|
264
127
|
Ok(())
|
|
265
128
|
}
|
|
@@ -175,9 +175,6 @@ pub fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<C
|
|
|
175
175
|
"br_in_tables" => {
|
|
176
176
|
update.br_in_tables = Some(bool::try_convert(val)?);
|
|
177
177
|
}
|
|
178
|
-
"hocr_spatial_tables" => {
|
|
179
|
-
update.hocr_spatial_tables = Some(bool::try_convert(val)?);
|
|
180
|
-
}
|
|
181
178
|
"highlight_style" => {
|
|
182
179
|
update.highlight_style = Some(parse_highlight_style(val)?);
|
|
183
180
|
}
|