html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.29.0"
3
+ version = "3.0.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -26,8 +26,6 @@ html-to-markdown-rs = { path = "../../../vendor/html-to-markdown-rs", features =
26
26
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
27
27
  "rb-sys",
28
28
  ] }
29
- pprof = { version = "0.15", features = ["flamegraph"], optional = true }
30
-
31
29
  [dev-dependencies]
32
30
  pretty_assertions = "1.4"
33
31
 
@@ -36,7 +34,6 @@ default = ["inline-images", "metadata", "visitor"]
36
34
  inline-images = ["html-to-markdown-rs/inline-images"]
37
35
  metadata = ["html-to-markdown-rs/metadata"]
38
36
  visitor = ["html-to-markdown-rs/visitor"]
39
- profiling = ["dep:pprof"]
40
37
 
41
38
  [lints.rust]
42
39
  unsafe_code = "forbid"
@@ -144,25 +144,16 @@ markdown = HtmlToMarkdown.convert(
144
144
 
145
145
  ### Inline Images
146
146
 
147
- Extract inline binary data (data URIs, SVG) together with the converted Markdown.
147
+ Convert HTML with inline images (data URIs, SVG) to Markdown.
148
148
 
149
149
  ```ruby
150
150
  require 'html_to_markdown'
151
151
 
152
- result = HtmlToMarkdown.convert_with_inline_images(
153
- '<img src="data:image/png;base64,iVBORw0..." alt="Pixel">',
154
- image_config: {
155
- max_decoded_size_bytes: 1 * 1024 * 1024,
156
- infer_dimensions: true,
157
- filename_prefix: 'img_',
158
- capture_svg: true
159
- }
152
+ markdown = HtmlToMarkdown.convert(
153
+ '<img src="data:image/png;base64,iVBORw0..." alt="Pixel">'
160
154
  )
161
155
 
162
- puts result.markdown
163
- result.inline_images.each do |img|
164
- puts "#{img.filename} -> #{img.format} (#{img.data.bytesize} bytes)"
165
- end
156
+ puts markdown
166
157
  ```
167
158
 
168
159
  ## CLI
@@ -1,53 +1,8 @@
1
1
  //! Inline image configuration and conversion functions.
2
2
 
3
- use crate::types::{arg_error, symbol_to_string};
4
- use html_to_markdown_rs::{
5
- DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
6
- InlineImageWarning,
7
- };
3
+ use html_to_markdown_rs::InlineImage;
8
4
  use magnus::prelude::*;
9
- use magnus::r_hash::ForEach;
10
- use magnus::{Error, RHash, Ruby, TryConvert, Value};
11
-
12
- pub fn build_inline_image_config(_ruby: &Ruby, config: Option<Value>) -> Result<InlineImageConfig, Error> {
13
- let mut update = InlineImageConfigUpdate::default();
14
-
15
- let Some(config) = config else {
16
- return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
17
- };
18
-
19
- if config.is_nil() {
20
- return Ok(InlineImageConfig::new(DEFAULT_INLINE_IMAGE_LIMIT));
21
- }
22
-
23
- let hash = RHash::from_value(config).ok_or_else(|| arg_error("inline image config must be provided as a Hash"))?;
24
-
25
- hash.foreach(|key: Value, val: Value| {
26
- let key_name = symbol_to_string(key)?;
27
- match key_name.as_str() {
28
- "max_decoded_size_bytes" => {
29
- update.max_decoded_size_bytes = Some(u64::try_convert(val)?);
30
- }
31
- "filename_prefix" => {
32
- update.filename_prefix = if val.is_nil() {
33
- None
34
- } else {
35
- Some(String::try_convert(val)?)
36
- };
37
- }
38
- "capture_svg" => {
39
- update.capture_svg = Some(bool::try_convert(val)?);
40
- }
41
- "infer_dimensions" => {
42
- update.infer_dimensions = Some(bool::try_convert(val)?);
43
- }
44
- _ => {}
45
- }
46
- Ok(ForEach::Continue)
47
- })?;
48
-
49
- Ok(InlineImageConfig::from_update(update))
50
- }
5
+ use magnus::{Error, Ruby, Value};
51
6
 
52
7
  pub fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, Error> {
53
8
  let InlineImage {
@@ -97,29 +52,3 @@ pub fn inline_image_to_value(ruby: &Ruby, image: InlineImage) -> Result<Value, E
97
52
 
98
53
  Ok(hash.as_value())
99
54
  }
100
-
101
- pub fn warning_to_value(ruby: &Ruby, warning: InlineImageWarning) -> Result<Value, Error> {
102
- let hash = ruby.hash_new();
103
- hash.aset(ruby.intern("index"), warning.index as i64)?;
104
- hash.aset(ruby.intern("message"), warning.message)?;
105
- Ok(hash.as_value())
106
- }
107
-
108
- pub fn extraction_to_value(ruby: &Ruby, extraction: HtmlExtraction) -> Result<Value, Error> {
109
- let hash = ruby.hash_new();
110
- hash.aset(ruby.intern("markdown"), extraction.markdown)?;
111
-
112
- let inline_images = ruby.ary_new();
113
- for image in extraction.inline_images {
114
- inline_images.push(inline_image_to_value(ruby, image)?)?;
115
- }
116
- hash.aset(ruby.intern("inline_images"), inline_images)?;
117
-
118
- let warnings = ruby.ary_new();
119
- for warning in extraction.warnings {
120
- warnings.push(warning_to_value(ruby, warning)?)?;
121
- }
122
- hash.aset(ruby.intern("warnings"), warnings)?;
123
-
124
- Ok(hash.as_value())
125
- }
@@ -1,56 +1,12 @@
1
1
  //! Metadata configuration and conversion functions.
2
2
 
3
- use crate::types::{arg_error, symbol_to_string};
4
3
  use html_to_markdown_rs::metadata::{
5
- DocumentMetadata as RustDocumentMetadata, ExtendedMetadata as RustExtendedMetadata,
6
- HeaderMetadata as RustHeaderMetadata, ImageMetadata as RustImageMetadata, LinkMetadata as RustLinkMetadata,
7
- MetadataConfig as RustMetadataConfig, StructuredData as RustStructuredData, TextDirection as RustTextDirection,
4
+ DocumentMetadata as RustDocumentMetadata, HeaderMetadata as RustHeaderMetadata,
5
+ HtmlMetadata as RustHtmlMetadata, ImageMetadata as RustImageMetadata, LinkMetadata as RustLinkMetadata,
6
+ StructuredData as RustStructuredData, TextDirection as RustTextDirection,
8
7
  };
9
8
  use magnus::prelude::*;
10
- use magnus::r_hash::ForEach;
11
- use magnus::{Error, RHash, Ruby, TryConvert, Value};
12
-
13
- pub fn build_metadata_config(_ruby: &Ruby, config: Option<Value>) -> Result<RustMetadataConfig, Error> {
14
- let mut cfg = RustMetadataConfig::default();
15
-
16
- let Some(config) = config else {
17
- return Ok(cfg);
18
- };
19
-
20
- if config.is_nil() {
21
- return Ok(cfg);
22
- }
23
-
24
- let hash = RHash::from_value(config).ok_or_else(|| arg_error("metadata_config must be provided as a Hash"))?;
25
-
26
- hash.foreach(|key: Value, val: Value| {
27
- let key_name = symbol_to_string(key)?;
28
- match key_name.as_str() {
29
- "extract_document" => {
30
- cfg.extract_document = bool::try_convert(val)?;
31
- }
32
- "extract_headers" => {
33
- cfg.extract_headers = bool::try_convert(val)?;
34
- }
35
- "extract_links" => {
36
- cfg.extract_links = bool::try_convert(val)?;
37
- }
38
- "extract_images" => {
39
- cfg.extract_images = bool::try_convert(val)?;
40
- }
41
- "extract_structured_data" => {
42
- cfg.extract_structured_data = bool::try_convert(val)?;
43
- }
44
- "max_structured_data_size" => {
45
- cfg.max_structured_data_size = usize::try_convert(val)?;
46
- }
47
- _ => {}
48
- }
49
- Ok(ForEach::Continue)
50
- })?;
51
-
52
- Ok(cfg)
53
- }
9
+ use magnus::{Error, Ruby, Value};
54
10
 
55
11
  fn opt_string_to_ruby(ruby: &Ruby, opt: Option<String>) -> Result<Value, Error> {
56
12
  match opt {
@@ -183,7 +139,7 @@ fn structured_data_to_ruby(ruby: &Ruby, data: Vec<RustStructuredData>) -> Result
183
139
  Ok(array.as_value())
184
140
  }
185
141
 
186
- pub fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustExtendedMetadata) -> Result<Value, Error> {
142
+ pub fn extended_metadata_to_ruby(ruby: &Ruby, metadata: RustHtmlMetadata) -> Result<Value, Error> {
187
143
  let hash = ruby.hash_new();
188
144
 
189
145
  hash.aset(
@@ -5,13 +5,7 @@ pub mod inline_images;
5
5
  #[cfg(feature = "metadata")]
6
6
  pub mod metadata;
7
7
 
8
- #[cfg(feature = "visitor")]
9
- pub mod tables;
10
-
11
8
  pub use inline_images::*;
12
9
 
13
10
  #[cfg(feature = "metadata")]
14
11
  pub use metadata::*;
15
-
16
- #[cfg(feature = "visitor")]
17
- pub use tables::*;
@@ -1,52 +1,19 @@
1
1
  #![allow(clippy::all, clippy::pedantic, clippy::nursery, missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{
4
- ConversionOptions, convert as convert_inner, convert_with_inline_images as convert_with_inline_images_inner,
5
- error::ConversionError, safety::guard_panic,
6
- };
7
-
8
- #[cfg(feature = "visitor")]
9
- use html_to_markdown_rs::convert_with_visitor as convert_with_visitor_inner;
10
-
11
- #[cfg(feature = "visitor")]
12
- use html_to_markdown_rs::convert_with_tables as convert_with_tables_inner;
13
-
14
- #[cfg(feature = "metadata")]
15
- use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
3
+ use html_to_markdown_rs::{error::ConversionError, safety::guard_panic};
16
4
 
17
5
  mod conversion;
18
6
  mod options;
19
- mod profiling;
20
7
  mod types;
21
8
 
22
- #[cfg(feature = "visitor")]
23
- mod visitor;
24
-
25
- use conversion::{build_inline_image_config, extraction_to_value};
26
9
  use options::build_conversion_options;
27
10
  use types::{arg_error, runtime_error};
28
11
 
29
12
  #[cfg(feature = "metadata")]
30
- use conversion::{build_metadata_config, extended_metadata_to_ruby};
31
-
32
- #[cfg(feature = "visitor")]
33
- use conversion::tables_result_to_ruby;
34
-
35
- #[cfg(feature = "visitor")]
36
- use visitor::RubyVisitorWrapper;
13
+ use conversion::extended_metadata_to_ruby;
37
14
 
38
15
  use magnus::prelude::*;
39
- use magnus::{Error, Ruby, TryConvert, Value, function, scan_args::scan_args};
40
-
41
- #[cfg(feature = "visitor")]
42
- use std::panic::AssertUnwindSafe;
43
-
44
- #[cfg(feature = "profiling")]
45
- use std::path::PathBuf;
46
-
47
- #[derive(Clone)]
48
- #[magnus::wrap(class = "HtmlToMarkdown::Options", free_immediately)]
49
- struct OptionsHandle(ConversionOptions);
16
+ use magnus::{Error, Ruby, Value, function, scan_args::scan_args};
50
17
 
51
18
  fn conversion_error(err: ConversionError) -> Error {
52
19
  match err {
@@ -58,208 +25,104 @@ fn conversion_error(err: ConversionError) -> Error {
58
25
  }
59
26
  }
60
27
 
61
- fn convert_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
28
+ fn convert_full_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
62
29
  let parsed = scan_args::<(String,), (Option<Value>,), (), (), (), ()>(args)?;
63
30
  let html = parsed.required.0;
64
31
  let options = build_conversion_options(ruby, parsed.optional.0)?;
65
32
 
66
- guard_panic(|| profiling::maybe_profile(|| convert_inner(&html, Some(options)))).map_err(conversion_error)
67
- }
68
-
69
- fn options_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<OptionsHandle, Error> {
70
- let parsed = scan_args::<(), (Option<Value>,), (), (), (), ()>(args)?;
71
- let options = build_conversion_options(ruby, parsed.optional.0)?;
72
- Ok(OptionsHandle(options))
73
- }
74
-
75
- fn convert_with_options_handle_fn(_ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
76
- let parsed = scan_args::<(String, &OptionsHandle), (), (), (), (), ()>(args)?;
77
- let html = parsed.required.0;
78
- let handle = parsed.required.1;
79
- let options = handle.0.clone();
80
-
81
- guard_panic(|| profiling::maybe_profile(|| convert_inner(&html, Some(options)))).map_err(conversion_error)
82
- }
83
-
84
- #[cfg(feature = "inline-images")]
85
- fn convert_with_inline_images_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
86
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
87
- let html = parsed.required.0;
88
- let options = build_conversion_options(ruby, parsed.optional.0)?;
89
- let config = build_inline_image_config(ruby, parsed.optional.1)?;
90
-
91
- let extraction = guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config, None))
92
- .map_err(conversion_error)?;
93
-
94
- extraction_to_value(ruby, extraction)
95
- }
96
-
97
- #[cfg(feature = "inline-images")]
98
- fn convert_with_inline_images_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
99
- let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
100
- let html = parsed.required.0;
101
- let handle = parsed.required.1;
102
- let options = handle.0.clone();
103
- let config = build_inline_image_config(ruby, parsed.optional.0)?;
104
-
105
- let extraction = guard_panic(|| convert_with_inline_images_inner(&html, Some(options), config, None))
106
- .map_err(conversion_error)?;
107
-
108
- extraction_to_value(ruby, extraction)
109
- }
110
-
111
- #[cfg(feature = "metadata")]
112
- fn convert_with_metadata_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
113
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>, Option<Value>), (), (), (), ()>(args)?;
114
- let html = parsed.required.0;
115
- let options = build_conversion_options(ruby, parsed.optional.0)?;
116
- let metadata_config = build_metadata_config(ruby, parsed.optional.1)?;
117
- let _visitor = parsed.optional.2;
118
-
119
- let (markdown, metadata) = guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config, None))
33
+ let result = guard_panic(|| html_to_markdown_rs::convert(&html, Some(options.clone())))
120
34
  .map_err(conversion_error)?;
121
35
 
122
- let array = ruby.ary_new();
123
- array.push(markdown)?;
124
- array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
125
-
126
- Ok(array.as_value())
127
- }
128
-
129
- #[cfg(feature = "metadata")]
130
- fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
131
- let parsed = scan_args::<(String, &OptionsHandle), (Option<Value>,), (), (), (), ()>(args)?;
132
- let html = parsed.required.0;
133
- let handle = parsed.required.1;
134
- let options = handle.0.clone();
135
- let metadata_config = build_metadata_config(ruby, parsed.optional.0)?;
36
+ let hash = ruby.hash_new();
136
37
 
137
- let (markdown, metadata) = guard_panic(|| convert_with_metadata_inner(&html, Some(options), metadata_config, None))
138
- .map_err(conversion_error)?;
139
-
140
- let array = ruby.ary_new();
141
- array.push(markdown)?;
142
- array.push(extended_metadata_to_ruby(ruby, metadata)?)?;
143
-
144
- Ok(array.as_value())
145
- }
38
+ // content: Option<String>
39
+ match result.content {
40
+ Some(ref s) => hash.aset(ruby.intern("content"), s.as_str())?,
41
+ None => hash.aset(ruby.intern("content"), ruby.qnil())?,
42
+ }
146
43
 
147
- #[cfg(feature = "visitor")]
148
- fn convert_with_tables_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
149
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
150
- let html = parsed.required.0;
151
- let options = build_conversion_options(ruby, parsed.optional.0)?;
44
+ // document: not yet exposed
45
+ hash.aset(ruby.intern("document"), ruby.qnil())?;
152
46
 
47
+ // metadata
153
48
  #[cfg(feature = "metadata")]
154
- let metadata_config = Some(build_metadata_config(ruby, parsed.optional.1)?);
49
+ {
50
+ let metadata_value = extended_metadata_to_ruby(ruby, result.metadata)?;
51
+ hash.aset(ruby.intern("metadata"), metadata_value)?;
52
+ }
155
53
  #[cfg(not(feature = "metadata"))]
156
- let metadata_config: Option<()> = None;
157
-
158
- let result =
159
- guard_panic(|| convert_with_tables_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
54
+ hash.aset(ruby.intern("metadata"), ruby.qnil())?;
160
55
 
161
- tables_result_to_ruby(ruby, result)
162
- }
163
-
164
- #[cfg(feature = "visitor")]
165
- fn convert_with_visitor_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
166
- let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
167
- let html = parsed.required.0;
168
-
169
- let options = match parsed.optional.0 {
170
- Some(opt_val) => match <&OptionsHandle>::try_convert(opt_val) {
171
- Ok(handle) => handle.0.clone(),
172
- Err(_) => build_conversion_options(ruby, Some(opt_val))?,
173
- },
174
- None => ConversionOptions::default(),
175
- };
176
-
177
- let visitor_value = match parsed.optional.1 {
178
- Some(val) => {
179
- if val.is_nil() {
180
- return guard_panic(AssertUnwindSafe(|| {
181
- profiling::maybe_profile(|| convert_inner(&html, Some(options)))
182
- }))
183
- .map_err(conversion_error);
56
+ // tables: Vec<TableData> with grid and markdown
57
+ {
58
+ let tables_array = ruby.ary_new();
59
+ for table in &result.tables {
60
+ let table_hash = ruby.hash_new();
61
+ let grid_hash = ruby.hash_new();
62
+ grid_hash.aset(ruby.intern("rows"), table.grid.rows as i64)?;
63
+ grid_hash.aset(ruby.intern("cols"), table.grid.cols as i64)?;
64
+ let cells_array = ruby.ary_new();
65
+ for cell in &table.grid.cells {
66
+ let cell_hash = ruby.hash_new();
67
+ cell_hash.aset(ruby.intern("content"), cell.content.as_str())?;
68
+ cell_hash.aset(ruby.intern("row"), cell.row as i64)?;
69
+ cell_hash.aset(ruby.intern("col"), cell.col as i64)?;
70
+ cell_hash.aset(ruby.intern("row_span"), cell.row_span as i64)?;
71
+ cell_hash.aset(ruby.intern("col_span"), cell.col_span as i64)?;
72
+ cell_hash.aset(ruby.intern("is_header"), cell.is_header)?;
73
+ cells_array.push(cell_hash)?;
184
74
  }
185
- val
75
+ grid_hash.aset(ruby.intern("cells"), cells_array)?;
76
+ table_hash.aset(ruby.intern("grid"), grid_hash)?;
77
+ table_hash.aset(ruby.intern("markdown"), table.markdown.as_str())?;
78
+ tables_array.push(table_hash)?;
186
79
  }
187
- None => return Err(arg_error("visitor argument is required")),
188
- };
189
-
190
- let visitor_wrapper = RubyVisitorWrapper::new(visitor_value);
191
- let visitor_handle = std::rc::Rc::new(std::cell::RefCell::new(visitor_wrapper.clone()));
192
-
193
- let result = guard_panic(AssertUnwindSafe(|| {
194
- profiling::maybe_profile(|| convert_with_visitor_inner(&html, Some(options), Some(visitor_handle)))
195
- }))
196
- .map_err(conversion_error)?;
197
-
198
- if let Some(error_msg) = visitor_wrapper.last_error.borrow().as_ref() {
199
- return Err(runtime_error(error_msg.clone()));
80
+ hash.aset(ruby.intern("tables"), tables_array)?;
200
81
  }
201
82
 
202
- Ok(result)
203
- }
204
-
205
- #[cfg(feature = "profiling")]
206
- fn start_profiling_fn(_ruby: &Ruby, args: &[Value]) -> Result<bool, Error> {
207
- let output = args.first().ok_or_else(|| arg_error("output_path required"))?;
208
- let output: String = String::try_convert(*output)?;
209
- let freq = if let Some(value) = args.get(1) {
210
- i32::try_convert(*value)?
211
- } else {
212
- 1000
213
- };
214
- profiling::start(PathBuf::from(output), freq).map_err(conversion_error)?;
215
- Ok(true)
216
- }
217
-
218
- #[cfg(feature = "profiling")]
219
- fn stop_profiling_fn(_ruby: &Ruby, _args: &[Value]) -> Result<bool, Error> {
220
- profiling::stop().map_err(conversion_error)?;
221
- Ok(true)
222
- }
223
-
224
- #[magnus::init]
225
- fn init(ruby: &Ruby) -> Result<(), Error> {
226
- let module = ruby.define_module("HtmlToMarkdown")?;
227
- module.define_singleton_method("convert", function!(convert_fn, -1))?;
228
- module.define_singleton_method("options", function!(options_handle_fn, -1))?;
229
- module.define_singleton_method("convert_with_options", function!(convert_with_options_handle_fn, -1))?;
230
-
83
+ // images
231
84
  #[cfg(feature = "inline-images")]
232
85
  {
233
- module.define_singleton_method(
234
- "convert_with_inline_images",
235
- function!(convert_with_inline_images_fn, -1),
236
- )?;
237
- module.define_singleton_method(
238
- "convert_with_inline_images_handle",
239
- function!(convert_with_inline_images_handle_fn, -1),
240
- )?;
86
+ use conversion::inline_image_to_value;
87
+ let images_array = ruby.ary_new();
88
+ for image in result.images {
89
+ let image_value = inline_image_to_value(ruby, image)?;
90
+ images_array.push(image_value)?;
91
+ }
92
+ hash.aset(ruby.intern("images"), images_array)?;
241
93
  }
242
-
243
- #[cfg(feature = "metadata")]
94
+ #[cfg(not(feature = "inline-images"))]
244
95
  {
245
- module.define_singleton_method("convert_with_metadata", function!(convert_with_metadata_fn, -1))?;
246
- module.define_singleton_method(
247
- "convert_with_metadata_handle",
248
- function!(convert_with_metadata_handle_fn, -1),
249
- )?;
96
+ let empty = ruby.ary_new();
97
+ hash.aset(ruby.intern("images"), empty)?;
250
98
  }
251
99
 
252
- #[cfg(feature = "visitor")]
100
+ // warnings
253
101
  {
254
- module.define_singleton_method("convert_with_visitor", function!(convert_with_visitor_fn, -1))?;
255
- module.define_singleton_method("convert_with_tables", function!(convert_with_tables_fn, -1))?;
102
+ let warnings_array = ruby.ary_new();
103
+ for warning in &result.warnings {
104
+ let w_hash = ruby.hash_new();
105
+ w_hash.aset(ruby.intern("message"), warning.message.as_str())?;
106
+ let kind_str = match warning.kind {
107
+ html_to_markdown_rs::WarningKind::ImageExtractionFailed => "image_extraction_failed",
108
+ html_to_markdown_rs::WarningKind::EncodingFallback => "encoding_fallback",
109
+ html_to_markdown_rs::WarningKind::TruncatedInput => "truncated_input",
110
+ html_to_markdown_rs::WarningKind::MalformedHtml => "malformed_html",
111
+ html_to_markdown_rs::WarningKind::SanitizationApplied => "sanitization_applied",
112
+ };
113
+ w_hash.aset(ruby.intern("kind"), kind_str)?;
114
+ warnings_array.push(w_hash)?;
115
+ }
116
+ hash.aset(ruby.intern("warnings"), warnings_array)?;
256
117
  }
257
118
 
258
- #[cfg(feature = "profiling")]
259
- {
260
- module.define_singleton_method("start_profiling", function!(start_profiling_fn, -1))?;
261
- module.define_singleton_method("stop_profiling", function!(stop_profiling_fn, -1))?;
262
- }
119
+ Ok(hash.as_value())
120
+ }
121
+
122
+ #[magnus::init]
123
+ fn init(ruby: &Ruby) -> Result<(), Error> {
124
+ let module = ruby.define_module("HtmlToMarkdown")?;
125
+ module.define_singleton_method("convert", function!(convert_full_fn, -1))?;
263
126
 
264
127
  Ok(())
265
128
  }
@@ -175,9 +175,6 @@ pub fn build_conversion_options(ruby: &Ruby, options: Option<Value>) -> Result<C
175
175
  "br_in_tables" => {
176
176
  update.br_in_tables = Some(bool::try_convert(val)?);
177
177
  }
178
- "hocr_spatial_tables" => {
179
- update.hocr_spatial_tables = Some(bool::try_convert(val)?);
180
- }
181
178
  "highlight_style" => {
182
179
  update.highlight_style = Some(parse_highlight_style(val)?);
183
180
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.29.0'
4
+ VERSION = '3.0.0'
5
5
  end