html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,254 +0,0 @@
1
- //! Code block analysis and validation
2
-
3
- use super::hierarchy::CodeLineInfo;
4
- use super::keywords::{contains_keyword_token, is_shell_prompt, starts_with_keyword};
5
-
6
- /// Check if a line looks like it's part of a bullet list
7
- pub fn is_bullet_like(line: &str) -> bool {
8
- let trimmed = line.trim_start();
9
- if trimmed.is_empty() {
10
- return false;
11
- }
12
-
13
- if trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("+ ") || trimmed.starts_with("•")
14
- {
15
- return true;
16
- }
17
-
18
- let mut chars = trimmed.chars().peekable();
19
- let mut digit_count = 0;
20
- while let Some(&ch) = chars.peek() {
21
- if ch.is_ascii_digit() {
22
- digit_count += 1;
23
- chars.next();
24
- continue;
25
- }
26
- break;
27
- }
28
-
29
- if digit_count > 0 {
30
- if let Some(&ch) = chars.peek() {
31
- if (ch == '.' || ch == ')') && chars.clone().nth(1).is_some_and(char::is_whitespace) {
32
- return true;
33
- }
34
- }
35
- }
36
-
37
- false
38
- }
39
-
40
- /// Determine if lines form a paragraph of code
41
- pub fn is_code_paragraph(lines: &[CodeLineInfo]) -> bool {
42
- if lines.is_empty() {
43
- return false;
44
- }
45
-
46
- let mut strong_markers = 0;
47
- let mut moderate_markers = 0;
48
- let mut total = 0;
49
-
50
- for info in lines {
51
- let text = info.text.trim();
52
- if text.is_empty() {
53
- continue;
54
- }
55
-
56
- if is_bullet_like(&info.text) {
57
- return false;
58
- }
59
-
60
- total += 1;
61
- let lower = text.to_lowercase();
62
- let trimmed = text.trim_start();
63
-
64
- let documentation_tokens = [
65
- "definition",
66
- "theorem",
67
- "lemma",
68
- "proof",
69
- "corollary",
70
- "algorithm",
71
- "figure",
72
- "table",
73
- "appendix",
74
- ];
75
- if documentation_tokens
76
- .iter()
77
- .any(|token| contains_keyword_token(&lower, token))
78
- {
79
- return false;
80
- }
81
-
82
- let has_keyword = (starts_with_keyword(trimmed, "function") && text.contains('('))
83
- || (starts_with_keyword(trimmed, "return")
84
- && trimmed.chars().nth("return".len()).is_none_or(char::is_whitespace))
85
- || trimmed.starts_with("console.")
86
- || starts_with_keyword(trimmed, "async")
87
- || starts_with_keyword(trimmed, "await")
88
- || (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
89
- || (starts_with_keyword(trimmed, "struct") && text.contains('{'))
90
- || (starts_with_keyword(trimmed, "enum") && text.contains('{'))
91
- || (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
92
- || (starts_with_keyword(trimmed, "fn") && text.contains('('))
93
- || (starts_with_keyword(trimmed, "pub")
94
- && (text.contains("fn") || text.contains("struct") || text.contains("enum")))
95
- || starts_with_keyword(trimmed, "import")
96
- || starts_with_keyword(trimmed, "using")
97
- || starts_with_keyword(trimmed, "namespace")
98
- || starts_with_keyword(trimmed, "public")
99
- || starts_with_keyword(trimmed, "private")
100
- || starts_with_keyword(trimmed, "protected")
101
- || starts_with_keyword(trimmed, "static")
102
- || starts_with_keyword(trimmed, "void")
103
- || starts_with_keyword(trimmed, "try")
104
- || starts_with_keyword(trimmed, "catch")
105
- || starts_with_keyword(trimmed, "finally")
106
- || starts_with_keyword(trimmed, "throw")
107
- || starts_with_keyword(trimmed, "typedef")
108
- || starts_with_keyword(trimmed, "package")
109
- || starts_with_keyword(trimmed, "module");
110
-
111
- let has_symbol = text.contains(';') || text.contains("::");
112
-
113
- if has_keyword || has_symbol {
114
- strong_markers += 1;
115
- continue;
116
- }
117
-
118
- if is_shell_prompt(text) {
119
- strong_markers += 1;
120
- continue;
121
- }
122
- let has_assignment = text.contains(" = ")
123
- || text.contains("+=")
124
- || text.contains("-=")
125
- || text.contains("*=")
126
- || text.contains("/=")
127
- || text.contains(" := ")
128
- || text.contains(" == ");
129
-
130
- let has_arrow = text.contains("=>");
131
- let has_brace = text.contains('{') || text.contains('}');
132
- let has_pointer_arrow = text.contains("->");
133
-
134
- if has_assignment || has_arrow || has_brace || has_pointer_arrow {
135
- moderate_markers += 1;
136
- }
137
- }
138
-
139
- if total == 0 {
140
- return false;
141
- }
142
- if strong_markers == 0 {
143
- return false;
144
- }
145
- if strong_markers * 2 >= total {
146
- return true;
147
- }
148
- (strong_markers + moderate_markers) * 2 >= total
149
- }
150
-
151
- /// Check if code block is likely valid and not just prose
152
- pub fn is_confident_code_block(lines: &[CodeLineInfo]) -> bool {
153
- let mut total = 0;
154
- let mut keyword_lines = 0;
155
- let mut punctuation_lines = 0;
156
- let mut assignment_lines = 0;
157
- let mut shell_lines = 0;
158
- let mut indent_lines = 0;
159
-
160
- let min_x = lines.iter().map(|info| info.x1).min().unwrap_or_default();
161
-
162
- for info in lines {
163
- let text = info.text.trim();
164
- if text.is_empty() {
165
- continue;
166
- }
167
- total += 1;
168
-
169
- if is_shell_prompt(text) {
170
- shell_lines += 1;
171
- }
172
-
173
- let trimmed = text.trim_start();
174
-
175
- if (starts_with_keyword(trimmed, "function") && text.contains('('))
176
- || trimmed.starts_with("console.")
177
- || (starts_with_keyword(trimmed, "return")
178
- && trimmed.chars().nth("return".len()).is_none_or(char::is_whitespace))
179
- || starts_with_keyword(trimmed, "async")
180
- || starts_with_keyword(trimmed, "await")
181
- || (starts_with_keyword(trimmed, "class") && (text.contains('{') || text.contains(':')))
182
- || (starts_with_keyword(trimmed, "struct") && text.contains('{'))
183
- || (starts_with_keyword(trimmed, "enum") && text.contains('{'))
184
- || (starts_with_keyword(trimmed, "def") && (text.contains('(') || text.contains(':')))
185
- || (starts_with_keyword(trimmed, "fn") && text.contains('('))
186
- || (starts_with_keyword(trimmed, "pub")
187
- && (text.contains("fn") || text.contains("struct") || text.contains("enum")))
188
- || starts_with_keyword(trimmed, "import")
189
- || starts_with_keyword(trimmed, "using")
190
- || starts_with_keyword(trimmed, "namespace")
191
- || starts_with_keyword(trimmed, "public")
192
- || starts_with_keyword(trimmed, "private")
193
- || starts_with_keyword(trimmed, "protected")
194
- || starts_with_keyword(trimmed, "static")
195
- || starts_with_keyword(trimmed, "void")
196
- || starts_with_keyword(trimmed, "try")
197
- || starts_with_keyword(trimmed, "catch")
198
- || starts_with_keyword(trimmed, "finally")
199
- || starts_with_keyword(trimmed, "throw")
200
- || starts_with_keyword(trimmed, "typedef")
201
- || starts_with_keyword(trimmed, "package")
202
- || starts_with_keyword(trimmed, "module")
203
- {
204
- keyword_lines += 1;
205
- }
206
-
207
- if text.contains(';')
208
- || text.contains('{')
209
- || text.contains('}')
210
- || text.contains("::")
211
- || text.contains("->")
212
- || text.contains("=>")
213
- {
214
- punctuation_lines += 1;
215
- }
216
-
217
- if text.contains(" = ")
218
- || text.contains("+=")
219
- || text.contains("-=")
220
- || text.contains("*=")
221
- || text.contains("/=")
222
- || text.contains(" := ")
223
- || text.contains(" == ")
224
- {
225
- assignment_lines += 1;
226
- }
227
-
228
- if info.x1 > min_x + 8 {
229
- indent_lines += 1;
230
- }
231
- }
232
-
233
- if total < 3 {
234
- return false;
235
- }
236
-
237
- if shell_lines >= 2 && shell_lines * 2 >= total {
238
- return true;
239
- }
240
-
241
- if keyword_lines >= 2 && assignment_lines >= 1 {
242
- return true;
243
- }
244
-
245
- if keyword_lines >= 1 && punctuation_lines >= 1 && assignment_lines >= 1 {
246
- return true;
247
- }
248
-
249
- if indent_lines == total && keyword_lines >= 1 && assignment_lines >= 1 {
250
- return true;
251
- }
252
-
253
- false
254
- }
@@ -1,249 +0,0 @@
1
- //! Core conversion logic for hOCR to Markdown
2
-
3
- use super::elements::convert_element;
4
- use super::output::{ConvertContext, collapse_extra_newlines};
5
- use crate::hocr::types::HocrElement;
6
-
7
- /// Convert hOCR elements to Markdown with semantic formatting
8
- ///
9
- /// Transforms hOCR document structure into clean, readable Markdown while preserving
10
- /// document hierarchy and semantic meaning.
11
- ///
12
- /// # Arguments
13
- ///
14
- /// * `elements` - hOCR elements to convert (typically from `extract_hocr_document`)
15
- /// * `preserve_structure` - If `true`, sorts elements by their `order` property to respect reading order
16
- ///
17
- /// # Returns
18
- ///
19
- /// A `String` containing the formatted Markdown output
20
- ///
21
- /// # Semantic Conversion
22
- ///
23
- /// All 40 hOCR 1.2 element types are converted with appropriate markdown formatting:
24
- ///
25
- /// | hOCR Element | Markdown Output |
26
- /// |--------------|-----------------|
27
- /// | `ocr_title`, `ocr_chapter` | `# Heading` |
28
- /// | `ocr_section` | `## Heading` |
29
- /// | `ocr_subsection` | `### Heading` |
30
- /// | `ocr_par` | Paragraph with blank lines |
31
- /// | `ocr_blockquote` | `> Quote` |
32
- /// | `ocr_abstract` | `**Abstract**` header |
33
- /// | `ocr_author` | `*Author*` (italic) |
34
- /// | `ocr_image`, `ocr_photo` | `![alt](path)` |
35
- /// | `ocr_math`, `ocr_chem` | `` `formula` `` (inline code) |
36
- /// | `ocr_display` | ` ```equation``` ` (code block) |
37
- /// | `ocr_separator` | `---` (horizontal rule) |
38
- /// | `ocr_dropcap` | `**Letter**` (bold) |
39
- /// | `ocrx_word` | Word with markdown escaping |
40
- ///
41
- /// # Example
42
- ///
43
- /// ```rust
44
- /// use html_to_markdown_rs::hocr::{extract_hocr_document, convert_to_markdown};
45
- ///
46
- /// let html = r#"<div class="ocr_page">
47
- /// <h1 class="ocr_title">Document Title</h1>
48
- /// <p class="ocr_par" title="order 1">
49
- /// <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 95">Hello</span>
50
- /// <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">World</span>
51
- /// </p>
52
- /// </div>"#;
53
- ///
54
- /// let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
55
- /// let (elements, _) = extract_hocr_document(&dom);
56
- /// let markdown = convert_to_markdown(&elements, true);
57
- /// // Output: "# Document Title\n\nHello World"
58
- /// ```
59
- #[must_use]
60
- pub fn convert_to_markdown(elements: &[HocrElement], preserve_structure: bool) -> String {
61
- convert_to_markdown_with_options(elements, preserve_structure, true)
62
- }
63
-
64
- /// Convert hOCR elements to Markdown with advanced options.
65
- ///
66
- /// Transforms hOCR document structure into clean, readable Markdown with fine-grained
67
- /// control over structure preservation and spatial table reconstruction behavior.
68
- ///
69
- /// # Arguments
70
- ///
71
- /// * `elements` - hOCR elements to convert (typically from `extract_hocr_document`)
72
- /// * `preserve_structure` - If `true`, sorts elements by their `order` property to respect reading order.
73
- /// If `false`, elements are processed in their original tree order.
74
- /// * `enable_spatial_tables` - If `true`, attempts to reconstruct table structure from spatial
75
- /// positioning of words. If `false`, word positions are ignored and only text content is used.
76
- ///
77
- /// # Returns
78
- ///
79
- /// A `String` containing the formatted Markdown output
80
- ///
81
- /// # Performance
82
- ///
83
- /// - Spatial table reconstruction is more computationally expensive but produces better table formatting
84
- /// - For documents without tables, setting `enable_spatial_tables` to `false` improves performance
85
- /// - Structure preservation requires sorting which adds O(n log n) complexity; disable if not needed
86
- #[must_use]
87
- pub fn convert_to_markdown_with_options(
88
- elements: &[HocrElement],
89
- preserve_structure: bool,
90
- enable_spatial_tables: bool,
91
- ) -> String {
92
- let mut output = String::new();
93
- let mut ctx = ConvertContext::default();
94
-
95
- if preserve_structure && should_sort_children(elements) {
96
- let mut sorted_elements: Vec<&HocrElement> = elements.iter().collect();
97
- sorted_elements.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
98
- for element in sorted_elements {
99
- convert_element(
100
- element,
101
- &mut output,
102
- 0,
103
- preserve_structure,
104
- enable_spatial_tables,
105
- &mut ctx,
106
- );
107
- }
108
- } else {
109
- for element in elements {
110
- convert_element(
111
- element,
112
- &mut output,
113
- 0,
114
- preserve_structure,
115
- enable_spatial_tables,
116
- &mut ctx,
117
- );
118
- }
119
- }
120
-
121
- collapse_extra_newlines(&mut output);
122
- output.trim().to_string()
123
- }
124
-
125
- fn should_sort_children(children: &[HocrElement]) -> bool {
126
- let mut last = 0u32;
127
- let mut saw_any = false;
128
-
129
- for child in children {
130
- let order = child.properties.order.unwrap_or(u32::MAX);
131
- if saw_any && order < last {
132
- return true;
133
- }
134
- last = order;
135
- saw_any = true;
136
- }
137
-
138
- false
139
- }
140
-
141
- #[cfg(test)]
142
- mod tests {
143
- use super::*;
144
- use crate::hocr::types::{BBox, HocrElement, HocrElementType, HocrProperties};
145
-
146
- #[test]
147
- fn test_convert_title() {
148
- let element = HocrElement {
149
- element_type: HocrElementType::OcrTitle,
150
- properties: HocrProperties::default(),
151
- text: "Document Title".to_string(),
152
- children: vec![],
153
- };
154
-
155
- let markdown = convert_to_markdown(&[element], true);
156
- assert_eq!(markdown, "# Document Title");
157
- }
158
-
159
- #[test]
160
- fn test_spatial_table_reconstruction_can_be_disabled() {
161
- fn word(text: &str, x1: u32, y1: u32) -> HocrElement {
162
- HocrElement {
163
- element_type: HocrElementType::OcrxWord,
164
- properties: HocrProperties {
165
- bbox: Some(BBox {
166
- x1,
167
- y1,
168
- x2: x1 + 40,
169
- y2: y1 + 20,
170
- }),
171
- x_wconf: Some(95.0),
172
- ..HocrProperties::default()
173
- },
174
- text: text.to_string(),
175
- children: vec![],
176
- }
177
- }
178
-
179
- let paragraph = HocrElement {
180
- element_type: HocrElementType::OcrPar,
181
- properties: HocrProperties::default(),
182
- text: String::new(),
183
- children: vec![
184
- word("A", 10, 10),
185
- word("B", 120, 10),
186
- word("C", 230, 10),
187
- word("D", 12, 60),
188
- word("E", 122, 60),
189
- word("F", 232, 60),
190
- ],
191
- };
192
-
193
- let markdown_with_tables = convert_to_markdown_with_options(std::slice::from_ref(&paragraph), true, true);
194
- assert!(
195
- markdown_with_tables.contains("| --- |"),
196
- "Expected spatial table reconstruction to produce a markdown table"
197
- );
198
-
199
- let markdown_without_tables = convert_to_markdown_with_options(std::slice::from_ref(&paragraph), true, false);
200
- assert!(
201
- !markdown_without_tables.contains('|'),
202
- "Table reconstruction should be disabled when the flag is false"
203
- );
204
- assert!(
205
- markdown_without_tables.contains("A B C"),
206
- "Plain text output should retain original word order"
207
- );
208
- }
209
-
210
- #[test]
211
- fn test_convert_paragraph_with_words() {
212
- let par = HocrElement {
213
- element_type: HocrElementType::OcrPar,
214
- properties: HocrProperties::default(),
215
- text: String::new(),
216
- children: vec![
217
- HocrElement {
218
- element_type: HocrElementType::OcrxWord,
219
- properties: HocrProperties::default(),
220
- text: "Hello".to_string(),
221
- children: vec![],
222
- },
223
- HocrElement {
224
- element_type: HocrElementType::OcrxWord,
225
- properties: HocrProperties::default(),
226
- text: "World".to_string(),
227
- children: vec![],
228
- },
229
- ],
230
- };
231
-
232
- let markdown = convert_to_markdown(&[par], true);
233
- assert!(markdown.contains("Hello"));
234
- assert!(markdown.contains("World"));
235
- }
236
-
237
- #[test]
238
- fn test_convert_blockquote() {
239
- let quote = HocrElement {
240
- element_type: HocrElementType::OcrBlockquote,
241
- properties: HocrProperties::default(),
242
- text: "This is a quote".to_string(),
243
- children: vec![],
244
- };
245
-
246
- let markdown = convert_to_markdown(&[quote], true);
247
- assert!(markdown.starts_with("> "));
248
- }
249
- }