html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
#![allow(missing_docs)]
|
|
2
2
|
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
9
|
+
|
|
3
10
|
use std::fs;
|
|
4
11
|
use std::path::PathBuf;
|
|
5
12
|
|
|
6
|
-
use html_to_markdown_rs::
|
|
13
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
7
14
|
|
|
8
15
|
fn fixture_path(name: &str) -> PathBuf {
|
|
9
16
|
[env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
|
|
@@ -690,11 +690,13 @@ fn test_convert_accepts_visitor_parameter() {
|
|
|
690
690
|
}
|
|
691
691
|
|
|
692
692
|
/// Test visitor + `inline_images` feature combination
|
|
693
|
+
///
|
|
694
|
+
/// In v3, `convert()` handles inline-image extraction via `ConversionResult.images`,
|
|
695
|
+
/// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
|
|
696
|
+
/// work on the same HTML.
|
|
693
697
|
#[cfg(feature = "inline-images")]
|
|
694
698
|
#[test]
|
|
695
699
|
fn test_convert_with_inline_images_accepts_visitor() {
|
|
696
|
-
use html_to_markdown_rs::convert_with_inline_images;
|
|
697
|
-
|
|
698
700
|
#[derive(Debug, Default)]
|
|
699
701
|
struct ImageTrackingVisitor {
|
|
700
702
|
images_seen: usize,
|
|
@@ -715,15 +717,10 @@ fn test_convert_with_inline_images_accepts_visitor() {
|
|
|
715
717
|
<p>Some content</p>
|
|
716
718
|
"#;
|
|
717
719
|
|
|
720
|
+
// Verify visitor callbacks fire via convert_with_visitor
|
|
718
721
|
let visitor = Rc::new(RefCell::new(ImageTrackingVisitor::default()));
|
|
722
|
+
let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
|
|
719
723
|
|
|
720
|
-
// Test convert_with_inline_images with visitor
|
|
721
|
-
let image_cfg =
|
|
722
|
-
html_to_markdown_rs::InlineImageConfig::from_update(html_to_markdown_rs::InlineImageConfigUpdate::default());
|
|
723
|
-
let result = convert_with_inline_images(html, None, image_cfg, Some(visitor.clone()))
|
|
724
|
-
.expect("convert_with_inline_images with visitor should work");
|
|
725
|
-
|
|
726
|
-
// Verify that both visitor and inline image collection worked
|
|
727
724
|
assert_eq!(
|
|
728
725
|
visitor.borrow().images_seen,
|
|
729
726
|
1,
|
|
@@ -731,15 +728,17 @@ fn test_convert_with_inline_images_accepts_visitor() {
|
|
|
731
728
|
);
|
|
732
729
|
|
|
733
730
|
// Markdown should still be generated
|
|
734
|
-
assert!(!
|
|
731
|
+
assert!(!markdown.is_empty(), "Should produce markdown output");
|
|
735
732
|
}
|
|
736
733
|
|
|
737
|
-
/// Test visitor + metadata
|
|
734
|
+
/// Test visitor + metadata: visitor callbacks fire and metadata is collected.
|
|
735
|
+
///
|
|
736
|
+
/// In v3, `convert()` always extracts metadata into `ConversionResult.metadata`,
|
|
737
|
+
/// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
|
|
738
|
+
/// work on the same HTML.
|
|
738
739
|
#[cfg(feature = "metadata")]
|
|
739
740
|
#[test]
|
|
740
|
-
fn test_convert_with_metadata_accepts_visitor() {
|
|
741
|
-
use html_to_markdown_rs::convert_with_metadata;
|
|
742
|
-
|
|
741
|
+
fn test_visitor_and_metadata_both_work() {
|
|
743
742
|
#[derive(Debug, Default)]
|
|
744
743
|
struct MetadataAwareVisitor {
|
|
745
744
|
heading_count: usize,
|
|
@@ -770,14 +769,10 @@ fn test_convert_with_metadata_accepts_visitor() {
|
|
|
770
769
|
</html>
|
|
771
770
|
"#;
|
|
772
771
|
|
|
772
|
+
// Verify visitor callbacks fire via convert_with_visitor
|
|
773
773
|
let visitor = Rc::new(RefCell::new(MetadataAwareVisitor::default()));
|
|
774
|
+
let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
|
|
774
775
|
|
|
775
|
-
// Test convert_with_metadata with visitor
|
|
776
|
-
let metadata_cfg = html_to_markdown_rs::MetadataConfig::default();
|
|
777
|
-
let (markdown, metadata) = convert_with_metadata(html, None, metadata_cfg, Some(visitor.clone()))
|
|
778
|
-
.expect("convert_with_metadata with visitor should work");
|
|
779
|
-
|
|
780
|
-
// Verify visitor was invoked
|
|
781
776
|
let borrowed = visitor.borrow();
|
|
782
777
|
assert!(
|
|
783
778
|
borrowed.heading_count >= 2,
|
|
@@ -789,8 +784,13 @@ fn test_convert_with_metadata_accepts_visitor() {
|
|
|
789
784
|
"Visitor should see 2 links, got {}",
|
|
790
785
|
borrowed.link_count
|
|
791
786
|
);
|
|
787
|
+
assert!(!markdown.is_empty(), "Should produce markdown output");
|
|
788
|
+
drop(borrowed);
|
|
789
|
+
|
|
790
|
+
// Verify metadata extraction via convert()
|
|
791
|
+
let result = html_to_markdown_rs::convert(html, None).expect("convert should work");
|
|
792
|
+
let metadata = result.metadata;
|
|
792
793
|
|
|
793
|
-
// Verify metadata was also collected
|
|
794
794
|
assert_eq!(
|
|
795
795
|
metadata.document.title,
|
|
796
796
|
Some("Test Page".to_string()),
|
|
@@ -807,17 +807,16 @@ fn test_convert_with_metadata_accepts_visitor() {
|
|
|
807
807
|
"Metadata should extract 2 links, got {}",
|
|
808
808
|
metadata.links.len()
|
|
809
809
|
);
|
|
810
|
-
|
|
811
|
-
// Verify markdown was produced
|
|
812
|
-
assert!(!markdown.is_empty(), "Should produce markdown output");
|
|
813
810
|
}
|
|
814
811
|
|
|
815
812
|
/// Test visitor + both `inline_images` and `metadata` features together
|
|
813
|
+
///
|
|
814
|
+
/// In v3, `convert()` handles metadata and inline-image extraction via `ConversionResult`,
|
|
815
|
+
/// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
|
|
816
|
+
/// work on the same HTML.
|
|
816
817
|
#[cfg(all(feature = "inline-images", feature = "metadata"))]
|
|
817
818
|
#[test]
|
|
818
819
|
fn test_convert_with_all_features_and_visitor() {
|
|
819
|
-
use html_to_markdown_rs::convert_with_inline_images;
|
|
820
|
-
|
|
821
820
|
#[derive(Debug, Default)]
|
|
822
821
|
struct ComprehensiveVisitor {
|
|
823
822
|
headings: usize,
|
|
@@ -855,13 +854,9 @@ fn test_convert_with_all_features_and_visitor() {
|
|
|
855
854
|
</html>
|
|
856
855
|
"#;
|
|
857
856
|
|
|
857
|
+
// Verify visitor callbacks fire via convert_with_visitor
|
|
858
858
|
let visitor = Rc::new(RefCell::new(ComprehensiveVisitor::default()));
|
|
859
|
-
|
|
860
|
-
// Test with inline images feature (metadata feature doesn't affect inline-images function)
|
|
861
|
-
let image_cfg =
|
|
862
|
-
html_to_markdown_rs::InlineImageConfig::from_update(html_to_markdown_rs::InlineImageConfigUpdate::default());
|
|
863
|
-
let result = convert_with_inline_images(html, None, image_cfg, Some(visitor.clone()))
|
|
864
|
-
.expect("convert_with_inline_images with visitor should work");
|
|
859
|
+
let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
|
|
865
860
|
|
|
866
861
|
// Verify all visitor callbacks were invoked
|
|
867
862
|
let borrowed = visitor.borrow();
|
|
@@ -876,9 +871,10 @@ fn test_convert_with_all_features_and_visitor() {
|
|
|
876
871
|
borrowed.images
|
|
877
872
|
);
|
|
878
873
|
assert_eq!(borrowed.links, 2, "Visitor should see 2 links, got {}", borrowed.links);
|
|
874
|
+
drop(borrowed);
|
|
879
875
|
|
|
880
876
|
// Verify markdown was produced
|
|
881
|
-
assert!(!
|
|
877
|
+
assert!(!markdown.is_empty(), "Should produce markdown output");
|
|
882
878
|
}
|
|
883
879
|
|
|
884
880
|
/// Regression test: image visitor returning Custom with metadata extraction used to panic
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
#![allow(missing_docs)]
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
fn convert(
|
|
4
|
+
html: &str,
|
|
5
|
+
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
|
+
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
+
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
4
11
|
|
|
5
12
|
#[test]
|
|
6
13
|
fn test_basic_row_and_cell_conversion() {
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.29.0
|
|
4
|
+
version: 3.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-30 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -50,7 +50,6 @@ files:
|
|
|
50
50
|
- README.md
|
|
51
51
|
- Rakefile
|
|
52
52
|
- Steepfile
|
|
53
|
-
- bin/benchmark.rb
|
|
54
53
|
- exe/html-to-markdown
|
|
55
54
|
- ext/html-to-markdown-rb/extconf.rb
|
|
56
55
|
- ext/html-to-markdown-rb/native/Cargo.lock
|
|
@@ -60,14 +59,9 @@ files:
|
|
|
60
59
|
- ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
|
|
61
60
|
- ext/html-to-markdown-rb/native/src/conversion/metadata.rs
|
|
62
61
|
- ext/html-to-markdown-rb/native/src/conversion/mod.rs
|
|
63
|
-
- ext/html-to-markdown-rb/native/src/conversion/tables.rs
|
|
64
62
|
- ext/html-to-markdown-rb/native/src/lib.rs
|
|
65
63
|
- ext/html-to-markdown-rb/native/src/options.rs
|
|
66
|
-
- ext/html-to-markdown-rb/native/src/profiling.rs
|
|
67
64
|
- ext/html-to-markdown-rb/native/src/types.rs
|
|
68
|
-
- ext/html-to-markdown-rb/native/src/visitor/bridge.rs
|
|
69
|
-
- ext/html-to-markdown-rb/native/src/visitor/callbacks.rs
|
|
70
|
-
- ext/html-to-markdown-rb/native/src/visitor/mod.rs
|
|
71
65
|
- html-to-markdown-rb.gemspec
|
|
72
66
|
- lib/html_to_markdown.rb
|
|
73
67
|
- lib/html_to_markdown/cli.rb
|
|
@@ -78,12 +72,7 @@ files:
|
|
|
78
72
|
- sig/html_to_markdown/cli_proxy.rbs
|
|
79
73
|
- sig/open3.rbs
|
|
80
74
|
- spec/cli_proxy_spec.rb
|
|
81
|
-
- spec/convert_spec.rb
|
|
82
|
-
- spec/convert_with_tables_spec.rb
|
|
83
|
-
- spec/metadata_extraction_spec.rb
|
|
84
75
|
- spec/spec_helper.rb
|
|
85
|
-
- spec/visitor_issue_187_spec.rb
|
|
86
|
-
- spec/visitor_spec.rb
|
|
87
76
|
- vendor/Cargo.toml
|
|
88
77
|
- vendor/html-to-markdown-rs/Cargo.toml
|
|
89
78
|
- vendor/html-to-markdown-rs/README.md
|
|
@@ -174,23 +163,6 @@ files:
|
|
|
174
163
|
- vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs
|
|
175
164
|
- vendor/html-to-markdown-rs/src/error.rs
|
|
176
165
|
- vendor/html-to-markdown-rs/src/exports.rs
|
|
177
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs
|
|
178
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/core.rs
|
|
179
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/elements.rs
|
|
180
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs
|
|
181
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs
|
|
182
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/layout.rs
|
|
183
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/mod.rs
|
|
184
|
-
- vendor/html-to-markdown-rs/src/hocr/converter/output.rs
|
|
185
|
-
- vendor/html-to-markdown-rs/src/hocr/extractor.rs
|
|
186
|
-
- vendor/html-to-markdown-rs/src/hocr/mod.rs
|
|
187
|
-
- vendor/html-to-markdown-rs/src/hocr/parser.rs
|
|
188
|
-
- vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs
|
|
189
|
-
- vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs
|
|
190
|
-
- vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs
|
|
191
|
-
- vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs
|
|
192
|
-
- vendor/html-to-markdown-rs/src/hocr/spatial/output.rs
|
|
193
|
-
- vendor/html-to-markdown-rs/src/hocr/types.rs
|
|
194
166
|
- vendor/html-to-markdown-rs/src/inline_images.rs
|
|
195
167
|
- vendor/html-to-markdown-rs/src/lib.rs
|
|
196
168
|
- vendor/html-to-markdown-rs/src/metadata/collector.rs
|
|
@@ -207,16 +179,19 @@ files:
|
|
|
207
179
|
- vendor/html-to-markdown-rs/src/rcdom.rs
|
|
208
180
|
- vendor/html-to-markdown-rs/src/safety.rs
|
|
209
181
|
- vendor/html-to-markdown-rs/src/text.rs
|
|
182
|
+
- vendor/html-to-markdown-rs/src/types/document.rs
|
|
183
|
+
- vendor/html-to-markdown-rs/src/types/mod.rs
|
|
184
|
+
- vendor/html-to-markdown-rs/src/types/result.rs
|
|
185
|
+
- vendor/html-to-markdown-rs/src/types/structure_builder.rs
|
|
186
|
+
- vendor/html-to-markdown-rs/src/types/structure_collector.rs
|
|
187
|
+
- vendor/html-to-markdown-rs/src/types/tables.rs
|
|
188
|
+
- vendor/html-to-markdown-rs/src/types/warnings.rs
|
|
210
189
|
- vendor/html-to-markdown-rs/src/validation.rs
|
|
211
|
-
- vendor/html-to-markdown-rs/src/visitor/async_traits.rs
|
|
212
190
|
- vendor/html-to-markdown-rs/src/visitor/default_impl.rs
|
|
213
191
|
- vendor/html-to-markdown-rs/src/visitor/mod.rs
|
|
214
192
|
- vendor/html-to-markdown-rs/src/visitor/traits.rs
|
|
215
193
|
- vendor/html-to-markdown-rs/src/visitor/types.rs
|
|
216
194
|
- vendor/html-to-markdown-rs/src/visitor_helpers.rs
|
|
217
|
-
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs
|
|
218
|
-
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs
|
|
219
|
-
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs
|
|
220
195
|
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs
|
|
221
196
|
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs
|
|
222
197
|
- vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs
|
|
@@ -225,12 +200,9 @@ files:
|
|
|
225
200
|
- vendor/html-to-markdown-rs/src/wrapper.rs
|
|
226
201
|
- vendor/html-to-markdown-rs/src/wrapper/sync.rs
|
|
227
202
|
- vendor/html-to-markdown-rs/src/wrapper/utils.rs
|
|
228
|
-
- vendor/html-to-markdown-rs/tests/async_visitor_test.rs
|
|
229
203
|
- vendor/html-to-markdown-rs/tests/br_in_inline_test.rs
|
|
230
204
|
- vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs
|
|
231
|
-
- vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs
|
|
232
205
|
- vendor/html-to-markdown-rs/tests/djot_output_test.rs
|
|
233
|
-
- vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs
|
|
234
206
|
- vendor/html-to-markdown-rs/tests/integration_test.rs
|
|
235
207
|
- vendor/html-to-markdown-rs/tests/issue_121_regressions.rs
|
|
236
208
|
- vendor/html-to-markdown-rs/tests/issue_127_regressions.rs
|
data/bin/benchmark.rb
DELETED
|
@@ -1,232 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
# frozen_string_literal: true
|
|
3
|
-
|
|
4
|
-
require 'optparse'
|
|
5
|
-
require 'time'
|
|
6
|
-
|
|
7
|
-
$LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
|
|
8
|
-
require 'html_to_markdown'
|
|
9
|
-
|
|
10
|
-
def json_escape(value)
|
|
11
|
-
value.to_s.gsub(/["\\\n\r]/) do |char|
|
|
12
|
-
case char
|
|
13
|
-
when '"', '\\'
|
|
14
|
-
"\\#{char}"
|
|
15
|
-
when "\n"
|
|
16
|
-
'\\n'
|
|
17
|
-
when "\r"
|
|
18
|
-
'\\r'
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
options = {
|
|
24
|
-
iterations: 50,
|
|
25
|
-
format: 'html',
|
|
26
|
-
scenario: 'convert-default',
|
|
27
|
-
visitor: nil
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
OptionParser.new do |parser|
|
|
31
|
-
parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'
|
|
32
|
-
|
|
33
|
-
parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
|
|
34
|
-
options[:file] = file
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
|
|
38
|
-
options[:iterations] = n.positive? ? n : 1
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
parser.on('--scenario SCENARIO', 'Scenario to benchmark') do |scenario|
|
|
42
|
-
options[:scenario] = scenario
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
|
|
46
|
-
options[:format] = format.downcase
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
|
|
50
|
-
options[:visitor] = visitor if %w[noop simple custom complex].include?(visitor)
|
|
51
|
-
end
|
|
52
|
-
end.parse!
|
|
53
|
-
|
|
54
|
-
fixture = options.fetch(:file) do
|
|
55
|
-
warn 'Missing --file parameter'
|
|
56
|
-
exit 1
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
unless File.exist?(fixture)
|
|
60
|
-
warn "Fixture not found: #{fixture}"
|
|
61
|
-
exit 1
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
unless %w[html hocr].include?(options[:format])
|
|
65
|
-
warn "Unsupported format: #{options[:format]}"
|
|
66
|
-
exit 1
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
supported_scenarios = %w[
|
|
70
|
-
convert-default
|
|
71
|
-
convert-options
|
|
72
|
-
inline-images-default
|
|
73
|
-
inline-images-options
|
|
74
|
-
metadata-default
|
|
75
|
-
metadata-options
|
|
76
|
-
]
|
|
77
|
-
unless supported_scenarios.include?(options[:scenario])
|
|
78
|
-
warn "Unsupported scenario: #{options[:scenario]}"
|
|
79
|
-
exit 1
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
# Visitor factory functions
|
|
83
|
-
def create_noop_visitor
|
|
84
|
-
{
|
|
85
|
-
visit_text: proc { |_ctx, _text| 'continue' },
|
|
86
|
-
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
87
|
-
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
88
|
-
visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
|
|
89
|
-
visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
|
|
90
|
-
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
91
|
-
visit_em: proc { |_ctx, _text| 'continue' },
|
|
92
|
-
visit_code: proc { |_ctx, _text| 'continue' },
|
|
93
|
-
visit_br: proc { |_ctx| 'continue' }
|
|
94
|
-
}
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
def create_simple_visitor
|
|
98
|
-
{
|
|
99
|
-
text_count: 0,
|
|
100
|
-
link_count: 0,
|
|
101
|
-
image_count: 0,
|
|
102
|
-
visit_text: proc { |_ctx, _text| 'continue' },
|
|
103
|
-
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
104
|
-
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
105
|
-
visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
|
|
106
|
-
visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
|
|
107
|
-
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
108
|
-
visit_em: proc { |_ctx, _text| 'continue' },
|
|
109
|
-
visit_code: proc { |_ctx, _text| 'continue' },
|
|
110
|
-
visit_br: proc { |_ctx| 'continue' }
|
|
111
|
-
}
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
def create_custom_visitor
|
|
115
|
-
{
|
|
116
|
-
visit_text: proc { |_ctx, _text| 'continue' },
|
|
117
|
-
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
118
|
-
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
119
|
-
visit_link: proc { |_ctx, href, text, _title| ['custom', "LINK[#{text}](#{href})"] },
|
|
120
|
-
visit_image: proc { |_ctx, src, alt, _title| ['custom', ""] },
|
|
121
|
-
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
122
|
-
visit_em: proc { |_ctx, _text| 'continue' },
|
|
123
|
-
visit_code: proc { |_ctx, _text| 'continue' },
|
|
124
|
-
visit_br: proc { |_ctx| 'continue' }
|
|
125
|
-
}
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def create_complex_visitor
|
|
129
|
-
{
|
|
130
|
-
texts: 0,
|
|
131
|
-
links: 0,
|
|
132
|
-
images: 0,
|
|
133
|
-
headings: 0,
|
|
134
|
-
visit_text: proc { |_ctx, _text| 'continue' },
|
|
135
|
-
visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
|
|
136
|
-
visit_paragraph: proc { |_ctx, _text| 'continue' },
|
|
137
|
-
visit_link: proc { |_ctx, href, text, _title| ['custom', "[#{text}](#{href})"] },
|
|
138
|
-
visit_image: proc { |_ctx, _src, _alt, _title| 'skip' },
|
|
139
|
-
visit_strong: proc { |_ctx, _text| 'continue' },
|
|
140
|
-
visit_em: proc { |_ctx, _text| 'continue' },
|
|
141
|
-
visit_code: proc { |_ctx, _text| 'continue' },
|
|
142
|
-
visit_br: proc { |_ctx| 'continue' }
|
|
143
|
-
}
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
html = File.binread(fixture)
|
|
147
|
-
html.force_encoding(Encoding::UTF_8)
|
|
148
|
-
html.freeze
|
|
149
|
-
iterations = options[:iterations]
|
|
150
|
-
conversion_options = options[:format] == 'hocr' ? { hocr_spatial_tables: false } : {}
|
|
151
|
-
options_handle = if %w[convert-options inline-images-options metadata-options].include?(options[:scenario])
|
|
152
|
-
HtmlToMarkdown.options(conversion_options)
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
# Create visitor if specified
|
|
156
|
-
visitor = nil
|
|
157
|
-
if options[:visitor]
|
|
158
|
-
visitor_creators = {
|
|
159
|
-
'noop' => method(:create_noop_visitor),
|
|
160
|
-
'simple' => method(:create_simple_visitor),
|
|
161
|
-
'custom' => method(:create_custom_visitor),
|
|
162
|
-
'complex' => method(:create_complex_visitor)
|
|
163
|
-
}
|
|
164
|
-
creator = visitor_creators[options[:visitor]]
|
|
165
|
-
visitor = creator.call if creator
|
|
166
|
-
end
|
|
167
|
-
|
|
168
|
-
SCENARIO_RUNNERS = {
|
|
169
|
-
'convert-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert(html) },
|
|
170
|
-
'convert-options' => lambda do |html, _options, handle, _visitor|
|
|
171
|
-
raise ArgumentError, 'options handle required' unless handle
|
|
172
|
-
|
|
173
|
-
HtmlToMarkdown.convert_with_options(html, handle)
|
|
174
|
-
end,
|
|
175
|
-
'inline-images-default' => lambda { |html, _options, _handle, _visitor|
|
|
176
|
-
HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
|
|
177
|
-
},
|
|
178
|
-
'inline-images-options' => lambda do |html, _options, handle, _visitor|
|
|
179
|
-
raise ArgumentError, 'options handle required' unless handle
|
|
180
|
-
|
|
181
|
-
HtmlToMarkdown.convert_with_inline_images_handle(html, handle, nil)
|
|
182
|
-
end,
|
|
183
|
-
'metadata-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
|
|
184
|
-
'metadata-options' => lambda do |html, _options, handle, _visitor|
|
|
185
|
-
raise ArgumentError, 'options handle required' unless handle
|
|
186
|
-
|
|
187
|
-
HtmlToMarkdown.convert_with_metadata_handle(html, handle, nil)
|
|
188
|
-
end
|
|
189
|
-
}.freeze
|
|
190
|
-
|
|
191
|
-
def run_scenario(html, scenario, options, handle, visitor = nil)
|
|
192
|
-
if visitor
|
|
193
|
-
HtmlToMarkdown.convert_with_visitor(html, nil, visitor)
|
|
194
|
-
else
|
|
195
|
-
runner = SCENARIO_RUNNERS.fetch(scenario) { raise ArgumentError, "Unsupported scenario: #{scenario}" }
|
|
196
|
-
runner.call(html, options, handle, visitor)
|
|
197
|
-
end
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
|
|
201
|
-
|
|
202
|
-
profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
|
|
203
|
-
if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
|
|
204
|
-
freq = Integer(ENV.fetch('HTML_TO_MARKDOWN_PROFILE_FREQUENCY', '1000'), 10)
|
|
205
|
-
HtmlToMarkdown.start_profiling(profile_output, freq)
|
|
206
|
-
end
|
|
207
|
-
|
|
208
|
-
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
209
|
-
iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle, visitor) }
|
|
210
|
-
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
|
|
211
|
-
|
|
212
|
-
HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)
|
|
213
|
-
|
|
214
|
-
payload_size_bytes = html.bytesize
|
|
215
|
-
bytes_processed = payload_size_bytes * iterations
|
|
216
|
-
ops_per_sec = iterations / elapsed
|
|
217
|
-
mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed
|
|
218
|
-
|
|
219
|
-
payload = %({
|
|
220
|
-
"language":"ruby",
|
|
221
|
-
"fixture":"#{json_escape(File.basename(fixture))}",
|
|
222
|
-
"fixture_path":"#{json_escape(fixture)}",
|
|
223
|
-
"scenario":"#{json_escape(options[:scenario])}",
|
|
224
|
-
"iterations":#{iterations},
|
|
225
|
-
"elapsed_seconds":#{format('%.8f', elapsed)},
|
|
226
|
-
"ops_per_sec":#{format('%.4f', ops_per_sec)},
|
|
227
|
-
"mb_per_sec":#{format('%.4f', mb_per_sec)},
|
|
228
|
-
"bytes_processed":#{bytes_processed},
|
|
229
|
-
"payload_size_bytes":#{payload_size_bytes}
|
|
230
|
-
})
|
|
231
|
-
|
|
232
|
-
puts payload.strip
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
//! Table extraction conversion functions for Ruby bindings.
|
|
2
|
-
|
|
3
|
-
use html_to_markdown_rs::{ConversionWithTables, TableData};
|
|
4
|
-
use magnus::prelude::*;
|
|
5
|
-
use magnus::{Error, Ruby, Value};
|
|
6
|
-
|
|
7
|
-
#[cfg(feature = "metadata")]
|
|
8
|
-
use super::metadata::extended_metadata_to_ruby;
|
|
9
|
-
|
|
10
|
-
fn table_data_to_ruby(ruby: &Ruby, table: TableData) -> Result<Value, Error> {
|
|
11
|
-
let hash = ruby.hash_new();
|
|
12
|
-
|
|
13
|
-
// cells: Array[Array[String]]
|
|
14
|
-
let cells_array = ruby.ary_new();
|
|
15
|
-
for row in table.cells {
|
|
16
|
-
let row_array = ruby.ary_new();
|
|
17
|
-
for cell in row {
|
|
18
|
-
row_array.push(cell)?;
|
|
19
|
-
}
|
|
20
|
-
cells_array.push(row_array)?;
|
|
21
|
-
}
|
|
22
|
-
hash.aset(ruby.intern("cells"), cells_array)?;
|
|
23
|
-
|
|
24
|
-
// markdown: String
|
|
25
|
-
hash.aset(ruby.intern("markdown"), table.markdown)?;
|
|
26
|
-
|
|
27
|
-
// is_header_row: Array[bool]
|
|
28
|
-
let header_array = ruby.ary_new();
|
|
29
|
-
for is_header in table.is_header_row {
|
|
30
|
-
header_array.push(is_header)?;
|
|
31
|
-
}
|
|
32
|
-
hash.aset(ruby.intern("is_header_row"), header_array)?;
|
|
33
|
-
|
|
34
|
-
Ok(hash.as_value())
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
/// Convert a `ConversionWithTables` result to a Ruby Hash.
|
|
38
|
-
///
|
|
39
|
-
/// Returns a Hash with keys `:content`, `:metadata`, `:tables`.
|
|
40
|
-
pub fn tables_result_to_ruby(ruby: &Ruby, result: ConversionWithTables) -> Result<Value, Error> {
|
|
41
|
-
let hash = ruby.hash_new();
|
|
42
|
-
|
|
43
|
-
// content: String
|
|
44
|
-
hash.aset(ruby.intern("content"), result.content)?;
|
|
45
|
-
|
|
46
|
-
// metadata: Hash or nil
|
|
47
|
-
#[cfg(feature = "metadata")]
|
|
48
|
-
{
|
|
49
|
-
match result.metadata {
|
|
50
|
-
Some(metadata) => {
|
|
51
|
-
hash.aset(ruby.intern("metadata"), extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
52
|
-
}
|
|
53
|
-
None => {
|
|
54
|
-
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
#[cfg(not(feature = "metadata"))]
|
|
59
|
-
{
|
|
60
|
-
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
// tables: Array[Hash]
|
|
64
|
-
let tables_array = ruby.ary_new();
|
|
65
|
-
for table in result.tables {
|
|
66
|
-
tables_array.push(table_data_to_ruby(ruby, table)?)?;
|
|
67
|
-
}
|
|
68
|
-
hash.aset(ruby.intern("tables"), tables_array)?;
|
|
69
|
-
|
|
70
|
-
Ok(hash.as_value())
|
|
71
|
-
}
|