html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -690,11 +690,13 @@ fn test_convert_accepts_visitor_parameter() {
690
690
  }
691
691
 
692
692
  /// Test visitor + `inline_images` feature combination
693
+ ///
694
+ /// In v3, `convert()` handles inline-image extraction via `ConversionResult.images`,
695
+ /// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
696
+ /// work on the same HTML.
693
697
  #[cfg(feature = "inline-images")]
694
698
  #[test]
695
699
  fn test_convert_with_inline_images_accepts_visitor() {
696
- use html_to_markdown_rs::convert_with_inline_images;
697
-
698
700
  #[derive(Debug, Default)]
699
701
  struct ImageTrackingVisitor {
700
702
  images_seen: usize,
@@ -715,15 +717,10 @@ fn test_convert_with_inline_images_accepts_visitor() {
715
717
  <p>Some content</p>
716
718
  "#;
717
719
 
720
+ // Verify visitor callbacks fire via convert_with_visitor
718
721
  let visitor = Rc::new(RefCell::new(ImageTrackingVisitor::default()));
722
+ let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
719
723
 
720
- // Test convert_with_inline_images with visitor
721
- let image_cfg =
722
- html_to_markdown_rs::InlineImageConfig::from_update(html_to_markdown_rs::InlineImageConfigUpdate::default());
723
- let result = convert_with_inline_images(html, None, image_cfg, Some(visitor.clone()))
724
- .expect("convert_with_inline_images with visitor should work");
725
-
726
- // Verify that both visitor and inline image collection worked
727
724
  assert_eq!(
728
725
  visitor.borrow().images_seen,
729
726
  1,
@@ -731,15 +728,17 @@ fn test_convert_with_inline_images_accepts_visitor() {
731
728
  );
732
729
 
733
730
  // Markdown should still be generated
734
- assert!(!result.markdown.is_empty(), "Should produce markdown output");
731
+ assert!(!markdown.is_empty(), "Should produce markdown output");
735
732
  }
736
733
 
737
- /// Test visitor + metadata feature combination
734
+ /// Test visitor + metadata: visitor callbacks fire and metadata is collected.
735
+ ///
736
+ /// In v3, `convert()` always extracts metadata into `ConversionResult.metadata`,
737
+ /// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
738
+ /// work on the same HTML.
738
739
  #[cfg(feature = "metadata")]
739
740
  #[test]
740
- fn test_convert_with_metadata_accepts_visitor() {
741
- use html_to_markdown_rs::convert_with_metadata;
742
-
741
+ fn test_visitor_and_metadata_both_work() {
743
742
  #[derive(Debug, Default)]
744
743
  struct MetadataAwareVisitor {
745
744
  heading_count: usize,
@@ -770,14 +769,10 @@ fn test_convert_with_metadata_accepts_visitor() {
770
769
  </html>
771
770
  "#;
772
771
 
772
+ // Verify visitor callbacks fire via convert_with_visitor
773
773
  let visitor = Rc::new(RefCell::new(MetadataAwareVisitor::default()));
774
+ let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
774
775
 
775
- // Test convert_with_metadata with visitor
776
- let metadata_cfg = html_to_markdown_rs::MetadataConfig::default();
777
- let (markdown, metadata) = convert_with_metadata(html, None, metadata_cfg, Some(visitor.clone()))
778
- .expect("convert_with_metadata with visitor should work");
779
-
780
- // Verify visitor was invoked
781
776
  let borrowed = visitor.borrow();
782
777
  assert!(
783
778
  borrowed.heading_count >= 2,
@@ -789,8 +784,13 @@ fn test_convert_with_metadata_accepts_visitor() {
789
784
  "Visitor should see 2 links, got {}",
790
785
  borrowed.link_count
791
786
  );
787
+ assert!(!markdown.is_empty(), "Should produce markdown output");
788
+ drop(borrowed);
789
+
790
+ // Verify metadata extraction via convert()
791
+ let result = html_to_markdown_rs::convert(html, None).expect("convert should work");
792
+ let metadata = result.metadata;
792
793
 
793
- // Verify metadata was also collected
794
794
  assert_eq!(
795
795
  metadata.document.title,
796
796
  Some("Test Page".to_string()),
@@ -807,17 +807,16 @@ fn test_convert_with_metadata_accepts_visitor() {
807
807
  "Metadata should extract 2 links, got {}",
808
808
  metadata.links.len()
809
809
  );
810
-
811
- // Verify markdown was produced
812
- assert!(!markdown.is_empty(), "Should produce markdown output");
813
810
  }
814
811
 
815
812
  /// Test visitor + both `inline_images` and `metadata` features together
813
+ ///
814
+ /// In v3, `convert()` handles metadata and inline-image extraction via `ConversionResult`,
815
+ /// and `convert_with_visitor()` handles visitor callbacks. We verify both paths
816
+ /// work on the same HTML.
816
817
  #[cfg(all(feature = "inline-images", feature = "metadata"))]
817
818
  #[test]
818
819
  fn test_convert_with_all_features_and_visitor() {
819
- use html_to_markdown_rs::convert_with_inline_images;
820
-
821
820
  #[derive(Debug, Default)]
822
821
  struct ComprehensiveVisitor {
823
822
  headings: usize,
@@ -855,13 +854,9 @@ fn test_convert_with_all_features_and_visitor() {
855
854
  </html>
856
855
  "#;
857
856
 
857
+ // Verify visitor callbacks fire via convert_with_visitor
858
858
  let visitor = Rc::new(RefCell::new(ComprehensiveVisitor::default()));
859
-
860
- // Test with inline images feature (metadata feature doesn't affect inline-images function)
861
- let image_cfg =
862
- html_to_markdown_rs::InlineImageConfig::from_update(html_to_markdown_rs::InlineImageConfigUpdate::default());
863
- let result = convert_with_inline_images(html, None, image_cfg, Some(visitor.clone()))
864
- .expect("convert_with_inline_images with visitor should work");
859
+ let markdown = convert_with_visitor(html, None, Some(visitor.clone())).expect("convert_with_visitor should work");
865
860
 
866
861
  // Verify all visitor callbacks were invoked
867
862
  let borrowed = visitor.borrow();
@@ -876,9 +871,10 @@ fn test_convert_with_all_features_and_visitor() {
876
871
  borrowed.images
877
872
  );
878
873
  assert_eq!(borrowed.links, 2, "Visitor should see 2 links, got {}", borrowed.links);
874
+ drop(borrowed);
879
875
 
880
876
  // Verify markdown was produced
881
- assert!(!result.markdown.is_empty(), "Should produce markdown output");
877
+ assert!(!markdown.is_empty(), "Should produce markdown output");
882
878
  }
883
879
 
884
880
  /// Regression test: image visitor returning Custom with metadata extraction used to panic
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn test_basic_row_and_cell_conversion() {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.29.0
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-22 00:00:00.000000000 Z
11
+ date: 2026-03-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -50,7 +50,6 @@ files:
50
50
  - README.md
51
51
  - Rakefile
52
52
  - Steepfile
53
- - bin/benchmark.rb
54
53
  - exe/html-to-markdown
55
54
  - ext/html-to-markdown-rb/extconf.rb
56
55
  - ext/html-to-markdown-rb/native/Cargo.lock
@@ -60,14 +59,9 @@ files:
60
59
  - ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
61
60
  - ext/html-to-markdown-rb/native/src/conversion/metadata.rs
62
61
  - ext/html-to-markdown-rb/native/src/conversion/mod.rs
63
- - ext/html-to-markdown-rb/native/src/conversion/tables.rs
64
62
  - ext/html-to-markdown-rb/native/src/lib.rs
65
63
  - ext/html-to-markdown-rb/native/src/options.rs
66
- - ext/html-to-markdown-rb/native/src/profiling.rs
67
64
  - ext/html-to-markdown-rb/native/src/types.rs
68
- - ext/html-to-markdown-rb/native/src/visitor/bridge.rs
69
- - ext/html-to-markdown-rb/native/src/visitor/callbacks.rs
70
- - ext/html-to-markdown-rb/native/src/visitor/mod.rs
71
65
  - html-to-markdown-rb.gemspec
72
66
  - lib/html_to_markdown.rb
73
67
  - lib/html_to_markdown/cli.rb
@@ -78,12 +72,7 @@ files:
78
72
  - sig/html_to_markdown/cli_proxy.rbs
79
73
  - sig/open3.rbs
80
74
  - spec/cli_proxy_spec.rb
81
- - spec/convert_spec.rb
82
- - spec/convert_with_tables_spec.rb
83
- - spec/metadata_extraction_spec.rb
84
75
  - spec/spec_helper.rb
85
- - spec/visitor_issue_187_spec.rb
86
- - spec/visitor_spec.rb
87
76
  - vendor/Cargo.toml
88
77
  - vendor/html-to-markdown-rs/Cargo.toml
89
78
  - vendor/html-to-markdown-rs/README.md
@@ -174,23 +163,6 @@ files:
174
163
  - vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs
175
164
  - vendor/html-to-markdown-rs/src/error.rs
176
165
  - vendor/html-to-markdown-rs/src/exports.rs
177
- - vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs
178
- - vendor/html-to-markdown-rs/src/hocr/converter/core.rs
179
- - vendor/html-to-markdown-rs/src/hocr/converter/elements.rs
180
- - vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs
181
- - vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs
182
- - vendor/html-to-markdown-rs/src/hocr/converter/layout.rs
183
- - vendor/html-to-markdown-rs/src/hocr/converter/mod.rs
184
- - vendor/html-to-markdown-rs/src/hocr/converter/output.rs
185
- - vendor/html-to-markdown-rs/src/hocr/extractor.rs
186
- - vendor/html-to-markdown-rs/src/hocr/mod.rs
187
- - vendor/html-to-markdown-rs/src/hocr/parser.rs
188
- - vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs
189
- - vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs
190
- - vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs
191
- - vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs
192
- - vendor/html-to-markdown-rs/src/hocr/spatial/output.rs
193
- - vendor/html-to-markdown-rs/src/hocr/types.rs
194
166
  - vendor/html-to-markdown-rs/src/inline_images.rs
195
167
  - vendor/html-to-markdown-rs/src/lib.rs
196
168
  - vendor/html-to-markdown-rs/src/metadata/collector.rs
@@ -207,16 +179,19 @@ files:
207
179
  - vendor/html-to-markdown-rs/src/rcdom.rs
208
180
  - vendor/html-to-markdown-rs/src/safety.rs
209
181
  - vendor/html-to-markdown-rs/src/text.rs
182
+ - vendor/html-to-markdown-rs/src/types/document.rs
183
+ - vendor/html-to-markdown-rs/src/types/mod.rs
184
+ - vendor/html-to-markdown-rs/src/types/result.rs
185
+ - vendor/html-to-markdown-rs/src/types/structure_builder.rs
186
+ - vendor/html-to-markdown-rs/src/types/structure_collector.rs
187
+ - vendor/html-to-markdown-rs/src/types/tables.rs
188
+ - vendor/html-to-markdown-rs/src/types/warnings.rs
210
189
  - vendor/html-to-markdown-rs/src/validation.rs
211
- - vendor/html-to-markdown-rs/src/visitor/async_traits.rs
212
190
  - vendor/html-to-markdown-rs/src/visitor/default_impl.rs
213
191
  - vendor/html-to-markdown-rs/src/visitor/mod.rs
214
192
  - vendor/html-to-markdown-rs/src/visitor/traits.rs
215
193
  - vendor/html-to-markdown-rs/src/visitor/types.rs
216
194
  - vendor/html-to-markdown-rs/src/visitor_helpers.rs
217
- - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs
218
- - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs
219
- - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs
220
195
  - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs
221
196
  - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs
222
197
  - vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs
@@ -225,12 +200,9 @@ files:
225
200
  - vendor/html-to-markdown-rs/src/wrapper.rs
226
201
  - vendor/html-to-markdown-rs/src/wrapper/sync.rs
227
202
  - vendor/html-to-markdown-rs/src/wrapper/utils.rs
228
- - vendor/html-to-markdown-rs/tests/async_visitor_test.rs
229
203
  - vendor/html-to-markdown-rs/tests/br_in_inline_test.rs
230
204
  - vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs
231
- - vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs
232
205
  - vendor/html-to-markdown-rs/tests/djot_output_test.rs
233
- - vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs
234
206
  - vendor/html-to-markdown-rs/tests/integration_test.rs
235
207
  - vendor/html-to-markdown-rs/tests/issue_121_regressions.rs
236
208
  - vendor/html-to-markdown-rs/tests/issue_127_regressions.rs
data/bin/benchmark.rb DELETED
@@ -1,232 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'optparse'
5
- require 'time'
6
-
7
- $LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
8
- require 'html_to_markdown'
9
-
10
- def json_escape(value)
11
- value.to_s.gsub(/["\\\n\r]/) do |char|
12
- case char
13
- when '"', '\\'
14
- "\\#{char}"
15
- when "\n"
16
- '\\n'
17
- when "\r"
18
- '\\r'
19
- end
20
- end
21
- end
22
-
23
- options = {
24
- iterations: 50,
25
- format: 'html',
26
- scenario: 'convert-default',
27
- visitor: nil
28
- }
29
-
30
- OptionParser.new do |parser|
31
- parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'
32
-
33
- parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
34
- options[:file] = file
35
- end
36
-
37
- parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
38
- options[:iterations] = n.positive? ? n : 1
39
- end
40
-
41
- parser.on('--scenario SCENARIO', 'Scenario to benchmark') do |scenario|
42
- options[:scenario] = scenario
43
- end
44
-
45
- parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
46
- options[:format] = format.downcase
47
- end
48
-
49
- parser.on('--visitor VISITOR', 'Visitor type (noop, simple, custom, complex)') do |visitor|
50
- options[:visitor] = visitor if %w[noop simple custom complex].include?(visitor)
51
- end
52
- end.parse!
53
-
54
- fixture = options.fetch(:file) do
55
- warn 'Missing --file parameter'
56
- exit 1
57
- end
58
-
59
- unless File.exist?(fixture)
60
- warn "Fixture not found: #{fixture}"
61
- exit 1
62
- end
63
-
64
- unless %w[html hocr].include?(options[:format])
65
- warn "Unsupported format: #{options[:format]}"
66
- exit 1
67
- end
68
-
69
- supported_scenarios = %w[
70
- convert-default
71
- convert-options
72
- inline-images-default
73
- inline-images-options
74
- metadata-default
75
- metadata-options
76
- ]
77
- unless supported_scenarios.include?(options[:scenario])
78
- warn "Unsupported scenario: #{options[:scenario]}"
79
- exit 1
80
- end
81
-
82
- # Visitor factory functions
83
- def create_noop_visitor
84
- {
85
- visit_text: proc { |_ctx, _text| 'continue' },
86
- visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
87
- visit_paragraph: proc { |_ctx, _text| 'continue' },
88
- visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
89
- visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
90
- visit_strong: proc { |_ctx, _text| 'continue' },
91
- visit_em: proc { |_ctx, _text| 'continue' },
92
- visit_code: proc { |_ctx, _text| 'continue' },
93
- visit_br: proc { |_ctx| 'continue' }
94
- }
95
- end
96
-
97
- def create_simple_visitor
98
- {
99
- text_count: 0,
100
- link_count: 0,
101
- image_count: 0,
102
- visit_text: proc { |_ctx, _text| 'continue' },
103
- visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
104
- visit_paragraph: proc { |_ctx, _text| 'continue' },
105
- visit_link: proc { |_ctx, _href, _text, _title| 'continue' },
106
- visit_image: proc { |_ctx, _src, _alt, _title| 'continue' },
107
- visit_strong: proc { |_ctx, _text| 'continue' },
108
- visit_em: proc { |_ctx, _text| 'continue' },
109
- visit_code: proc { |_ctx, _text| 'continue' },
110
- visit_br: proc { |_ctx| 'continue' }
111
- }
112
- end
113
-
114
- def create_custom_visitor
115
- {
116
- visit_text: proc { |_ctx, _text| 'continue' },
117
- visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
118
- visit_paragraph: proc { |_ctx, _text| 'continue' },
119
- visit_link: proc { |_ctx, href, text, _title| ['custom', "LINK[#{text}](#{href})"] },
120
- visit_image: proc { |_ctx, src, alt, _title| ['custom', "![#{alt}](#{src})"] },
121
- visit_strong: proc { |_ctx, _text| 'continue' },
122
- visit_em: proc { |_ctx, _text| 'continue' },
123
- visit_code: proc { |_ctx, _text| 'continue' },
124
- visit_br: proc { |_ctx| 'continue' }
125
- }
126
- end
127
-
128
- def create_complex_visitor
129
- {
130
- texts: 0,
131
- links: 0,
132
- images: 0,
133
- headings: 0,
134
- visit_text: proc { |_ctx, _text| 'continue' },
135
- visit_heading: proc { |_ctx, _level, _text, _id| 'continue' },
136
- visit_paragraph: proc { |_ctx, _text| 'continue' },
137
- visit_link: proc { |_ctx, href, text, _title| ['custom', "[#{text}](#{href})"] },
138
- visit_image: proc { |_ctx, _src, _alt, _title| 'skip' },
139
- visit_strong: proc { |_ctx, _text| 'continue' },
140
- visit_em: proc { |_ctx, _text| 'continue' },
141
- visit_code: proc { |_ctx, _text| 'continue' },
142
- visit_br: proc { |_ctx| 'continue' }
143
- }
144
- end
145
-
146
- html = File.binread(fixture)
147
- html.force_encoding(Encoding::UTF_8)
148
- html.freeze
149
- iterations = options[:iterations]
150
- conversion_options = options[:format] == 'hocr' ? { hocr_spatial_tables: false } : {}
151
- options_handle = if %w[convert-options inline-images-options metadata-options].include?(options[:scenario])
152
- HtmlToMarkdown.options(conversion_options)
153
- end
154
-
155
- # Create visitor if specified
156
- visitor = nil
157
- if options[:visitor]
158
- visitor_creators = {
159
- 'noop' => method(:create_noop_visitor),
160
- 'simple' => method(:create_simple_visitor),
161
- 'custom' => method(:create_custom_visitor),
162
- 'complex' => method(:create_complex_visitor)
163
- }
164
- creator = visitor_creators[options[:visitor]]
165
- visitor = creator.call if creator
166
- end
167
-
168
- SCENARIO_RUNNERS = {
169
- 'convert-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert(html) },
170
- 'convert-options' => lambda do |html, _options, handle, _visitor|
171
- raise ArgumentError, 'options handle required' unless handle
172
-
173
- HtmlToMarkdown.convert_with_options(html, handle)
174
- end,
175
- 'inline-images-default' => lambda { |html, _options, _handle, _visitor|
176
- HtmlToMarkdown.convert_with_inline_images(html, nil, nil)
177
- },
178
- 'inline-images-options' => lambda do |html, _options, handle, _visitor|
179
- raise ArgumentError, 'options handle required' unless handle
180
-
181
- HtmlToMarkdown.convert_with_inline_images_handle(html, handle, nil)
182
- end,
183
- 'metadata-default' => ->(html, _options, _handle, _visitor) { HtmlToMarkdown.convert_with_metadata(html, nil, nil) },
184
- 'metadata-options' => lambda do |html, _options, handle, _visitor|
185
- raise ArgumentError, 'options handle required' unless handle
186
-
187
- HtmlToMarkdown.convert_with_metadata_handle(html, handle, nil)
188
- end
189
- }.freeze
190
-
191
- def run_scenario(html, scenario, options, handle, visitor = nil)
192
- if visitor
193
- HtmlToMarkdown.convert_with_visitor(html, nil, visitor)
194
- else
195
- runner = SCENARIO_RUNNERS.fetch(scenario) { raise ArgumentError, "Unsupported scenario: #{scenario}" }
196
- runner.call(html, options, handle, visitor)
197
- end
198
- end
199
-
200
- run_scenario(html, options[:scenario], conversion_options, options_handle, visitor)
201
-
202
- profile_output = ENV.fetch('HTML_TO_MARKDOWN_PROFILE_OUTPUT', nil)
203
- if profile_output && HtmlToMarkdown.respond_to?(:start_profiling)
204
- freq = Integer(ENV.fetch('HTML_TO_MARKDOWN_PROFILE_FREQUENCY', '1000'), 10)
205
- HtmlToMarkdown.start_profiling(profile_output, freq)
206
- end
207
-
208
- start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
209
- iterations.times { run_scenario(html, options[:scenario], conversion_options, options_handle, visitor) }
210
- elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
211
-
212
- HtmlToMarkdown.stop_profiling if profile_output && HtmlToMarkdown.respond_to?(:stop_profiling)
213
-
214
- payload_size_bytes = html.bytesize
215
- bytes_processed = payload_size_bytes * iterations
216
- ops_per_sec = iterations / elapsed
217
- mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed
218
-
219
- payload = %({
220
- "language":"ruby",
221
- "fixture":"#{json_escape(File.basename(fixture))}",
222
- "fixture_path":"#{json_escape(fixture)}",
223
- "scenario":"#{json_escape(options[:scenario])}",
224
- "iterations":#{iterations},
225
- "elapsed_seconds":#{format('%.8f', elapsed)},
226
- "ops_per_sec":#{format('%.4f', ops_per_sec)},
227
- "mb_per_sec":#{format('%.4f', mb_per_sec)},
228
- "bytes_processed":#{bytes_processed},
229
- "payload_size_bytes":#{payload_size_bytes}
230
- })
231
-
232
- puts payload.strip
@@ -1,71 +0,0 @@
1
- //! Table extraction conversion functions for Ruby bindings.
2
-
3
- use html_to_markdown_rs::{ConversionWithTables, TableData};
4
- use magnus::prelude::*;
5
- use magnus::{Error, Ruby, Value};
6
-
7
- #[cfg(feature = "metadata")]
8
- use super::metadata::extended_metadata_to_ruby;
9
-
10
- fn table_data_to_ruby(ruby: &Ruby, table: TableData) -> Result<Value, Error> {
11
- let hash = ruby.hash_new();
12
-
13
- // cells: Array[Array[String]]
14
- let cells_array = ruby.ary_new();
15
- for row in table.cells {
16
- let row_array = ruby.ary_new();
17
- for cell in row {
18
- row_array.push(cell)?;
19
- }
20
- cells_array.push(row_array)?;
21
- }
22
- hash.aset(ruby.intern("cells"), cells_array)?;
23
-
24
- // markdown: String
25
- hash.aset(ruby.intern("markdown"), table.markdown)?;
26
-
27
- // is_header_row: Array[bool]
28
- let header_array = ruby.ary_new();
29
- for is_header in table.is_header_row {
30
- header_array.push(is_header)?;
31
- }
32
- hash.aset(ruby.intern("is_header_row"), header_array)?;
33
-
34
- Ok(hash.as_value())
35
- }
36
-
37
- /// Convert a `ConversionWithTables` result to a Ruby Hash.
38
- ///
39
- /// Returns a Hash with keys `:content`, `:metadata`, `:tables`.
40
- pub fn tables_result_to_ruby(ruby: &Ruby, result: ConversionWithTables) -> Result<Value, Error> {
41
- let hash = ruby.hash_new();
42
-
43
- // content: String
44
- hash.aset(ruby.intern("content"), result.content)?;
45
-
46
- // metadata: Hash or nil
47
- #[cfg(feature = "metadata")]
48
- {
49
- match result.metadata {
50
- Some(metadata) => {
51
- hash.aset(ruby.intern("metadata"), extended_metadata_to_ruby(ruby, metadata)?)?;
52
- }
53
- None => {
54
- hash.aset(ruby.intern("metadata"), ruby.qnil())?;
55
- }
56
- }
57
- }
58
- #[cfg(not(feature = "metadata"))]
59
- {
60
- hash.aset(ruby.intern("metadata"), ruby.qnil())?;
61
- }
62
-
63
- // tables: Array[Hash]
64
- let tables_array = ruby.ary_new();
65
- for table in result.tables {
66
- tables_array.push(table_data_to_ruby(ruby, table)?)?;
67
- }
68
- hash.aset(ruby.intern("tables"), tables_array)?;
69
-
70
- Ok(hash.as_value())
71
- }