html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,382 +0,0 @@
1
- #![allow(clippy::branches_sharing_code)]
2
- //! Element-specific conversion logic for hOCR to Markdown
3
-
4
- use super::hierarchy::{
5
- append_text_and_children, collect_code_block, detect_heading_paragraph, ensure_heading_prefix,
6
- find_previous_heading,
7
- };
8
- use super::layout::{is_bullet_paragraph, try_spatial_table_reconstruction};
9
- use super::output::{ConvertContext, element_text_content, ensure_trailing_blank_line};
10
- use crate::hocr::types::{HocrElement, HocrElementType};
11
-
12
- pub fn convert_element(
13
- element: &HocrElement,
14
- output: &mut String,
15
- depth: usize,
16
- preserve_structure: bool,
17
- enable_spatial_tables: bool,
18
- ctx: &mut ConvertContext,
19
- ) {
20
- match element.element_type {
21
- HocrElementType::OcrTitle | HocrElementType::OcrChapter | HocrElementType::OcrPart => {
22
- if !output.is_empty() && !output.ends_with("\n\n") {
23
- output.push_str("\n\n");
24
- }
25
- output.push_str("# ");
26
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
27
- if output.ends_with(' ') {
28
- output.pop();
29
- }
30
- output.push_str("\n\n");
31
- }
32
- HocrElementType::OcrSection => {
33
- if !output.is_empty() && !output.ends_with("\n\n") {
34
- output.push_str("\n\n");
35
- }
36
- output.push_str("## ");
37
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
38
- if output.ends_with(' ') {
39
- output.pop();
40
- }
41
- output.push_str("\n\n");
42
- }
43
- HocrElementType::OcrSubsection => {
44
- if !output.is_empty() && !output.ends_with("\n\n") {
45
- output.push_str("\n\n");
46
- }
47
- output.push_str("### ");
48
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
49
- if output.ends_with(' ') {
50
- output.pop();
51
- }
52
- output.push_str("\n\n");
53
- }
54
- HocrElementType::OcrSubsubsection => {
55
- if !output.is_empty() && !output.ends_with("\n\n") {
56
- output.push_str("\n\n");
57
- }
58
- output.push_str("#### ");
59
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
60
- if output.ends_with(' ') {
61
- output.pop();
62
- }
63
- output.push_str("\n\n");
64
- }
65
-
66
- HocrElementType::OcrPar => {
67
- let text_snapshot = element_text_content(element);
68
- let bullet_paragraph = is_bullet_paragraph(element, &text_snapshot);
69
- if !output.is_empty() {
70
- if bullet_paragraph {
71
- if !output.ends_with('\n') {
72
- output.push('\n');
73
- }
74
- } else if !output.ends_with("\n\n") {
75
- output.push_str("\n\n");
76
- }
77
- }
78
-
79
- if let Some(heading) = detect_heading_paragraph(element, &text_snapshot) {
80
- if !output.is_empty() && !output.ends_with("\n\n") {
81
- if output.ends_with('\n') {
82
- output.push('\n');
83
- } else {
84
- output.push_str("\n\n");
85
- }
86
- }
87
- output.push_str("# ");
88
- output.push_str(&heading);
89
- output.push_str("\n\n");
90
- ctx.last_heading = Some(heading);
91
- return;
92
- }
93
-
94
- if enable_spatial_tables {
95
- if let Some(table_markdown) = try_spatial_table_reconstruction(element) {
96
- output.push_str(&table_markdown);
97
- ensure_trailing_blank_line(output);
98
- return;
99
- }
100
- }
101
-
102
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
103
- if output.ends_with(' ') {
104
- output.pop();
105
- }
106
- if bullet_paragraph {
107
- if !output.ends_with('\n') {
108
- output.push('\n');
109
- }
110
- } else {
111
- output.push_str("\n\n");
112
- }
113
- }
114
-
115
- HocrElementType::OcrBlockquote => {
116
- if !output.is_empty() && !output.ends_with("\n\n") {
117
- output.push_str("\n\n");
118
- }
119
- let mut quote_content = String::new();
120
- append_text_and_children(
121
- element,
122
- &mut quote_content,
123
- depth,
124
- preserve_structure,
125
- enable_spatial_tables,
126
- ctx,
127
- );
128
- for line in quote_content.trim().lines() {
129
- output.push_str("> ");
130
- output.push_str(line);
131
- output.push('\n');
132
- }
133
- output.push('\n');
134
- }
135
-
136
- HocrElementType::OcrLine | HocrElementType::OcrxLine => {
137
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
138
- if !output.ends_with(' ') && !output.ends_with('\n') {
139
- output.push(' ');
140
- }
141
- }
142
-
143
- HocrElementType::OcrxWord => {
144
- if !output.is_empty()
145
- && !output.ends_with(' ')
146
- && !output.ends_with('\t')
147
- && !output.ends_with('\n')
148
- && !output.ends_with('*')
149
- && !output.ends_with('`')
150
- && !output.ends_with('_')
151
- && !output.ends_with('[')
152
- {
153
- output.push(' ');
154
- }
155
-
156
- if !element.text.is_empty() {
157
- output.push_str(&element.text);
158
- }
159
- }
160
-
161
- HocrElementType::OcrHeader | HocrElementType::OcrFooter => {
162
- if !output.is_empty() && !output.ends_with("\n\n") {
163
- output.push_str("\n\n");
164
- }
165
- output.push('*');
166
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
167
- if output.ends_with(' ') {
168
- output.pop();
169
- }
170
- output.push_str("*\n\n");
171
- }
172
-
173
- HocrElementType::OcrCaption => {
174
- if !output.is_empty() && !output.ends_with("\n\n") {
175
- output.push_str("\n\n");
176
- }
177
- output.push('*');
178
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
179
- if output.ends_with(' ') {
180
- output.pop();
181
- }
182
- output.push_str("*\n\n");
183
- }
184
-
185
- HocrElementType::OcrPageno => {
186
- if !output.is_empty() && !output.ends_with("\n\n") {
187
- output.push_str("\n\n");
188
- }
189
- output.push_str("---\n");
190
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
191
- output.push_str("\n---\n\n");
192
- }
193
-
194
- HocrElementType::OcrAbstract => {
195
- if !output.is_empty() && !output.ends_with("\n\n") {
196
- output.push_str("\n\n");
197
- }
198
- output.push_str("**Abstract**\n\n");
199
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
200
- if output.ends_with(' ') {
201
- output.pop();
202
- }
203
- output.push_str("\n\n");
204
- }
205
-
206
- HocrElementType::OcrAuthor => {
207
- if !output.is_empty() && !output.ends_with("\n\n") {
208
- output.push_str("\n\n");
209
- }
210
- output.push('*');
211
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
212
- if output.ends_with(' ') {
213
- output.pop();
214
- }
215
- output.push_str("*\n\n");
216
- }
217
-
218
- HocrElementType::OcrSeparator => {
219
- if !output.is_empty() && !output.ends_with("\n\n") {
220
- output.push_str("\n\n");
221
- }
222
- output.push_str("---\n\n");
223
- }
224
-
225
- HocrElementType::OcrTable => {
226
- if !output.is_empty() && !output.ends_with("\n\n") {
227
- output.push_str("\n\n");
228
- }
229
-
230
- if enable_spatial_tables {
231
- if let Some(table_markdown) = try_spatial_table_reconstruction(element) {
232
- output.push_str(&table_markdown);
233
- ensure_trailing_blank_line(output);
234
- } else {
235
- let mut sorted_children: Vec<_> = element.children.iter().collect();
236
- if preserve_structure {
237
- sorted_children.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
238
- }
239
- for child in sorted_children {
240
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
241
- }
242
- ensure_trailing_blank_line(output);
243
- }
244
- } else {
245
- let mut sorted_children: Vec<_> = element.children.iter().collect();
246
- if preserve_structure {
247
- sorted_children.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
248
- }
249
- for child in sorted_children {
250
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
251
- }
252
- ensure_trailing_blank_line(output);
253
- }
254
- }
255
-
256
- HocrElementType::OcrFloat | HocrElementType::OcrTextfloat | HocrElementType::OcrTextimage => {
257
- if !output.is_empty() && !output.ends_with("\n\n") {
258
- output.push_str("\n\n");
259
- }
260
- let mut sorted_children: Vec<_> = element.children.iter().collect();
261
- if preserve_structure {
262
- sorted_children.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
263
- }
264
- for child in sorted_children {
265
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
266
- }
267
- ensure_trailing_blank_line(output);
268
- }
269
-
270
- HocrElementType::OcrImage | HocrElementType::OcrPhoto | HocrElementType::OcrLinedrawing => {
271
- if !output.is_empty() && !output.ends_with("\n\n") {
272
- output.push_str("\n\n");
273
- }
274
- if let Some(ref image_path) = element.properties.image {
275
- output.push_str("![");
276
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
277
- if output.ends_with(' ') {
278
- output.pop();
279
- }
280
- output.push_str("](");
281
- output.push_str(image_path);
282
- output.push_str(")\n\n");
283
- } else {
284
- output.push_str("![Image]\n\n");
285
- }
286
- }
287
-
288
- HocrElementType::OcrMath | HocrElementType::OcrChem => {
289
- output.push('`');
290
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
291
- if output.ends_with(' ') {
292
- output.pop();
293
- }
294
- output.push('`');
295
- }
296
-
297
- HocrElementType::OcrDisplay => {
298
- if !output.is_empty() && !output.ends_with("\n\n") {
299
- output.push_str("\n\n");
300
- }
301
- output.push_str("```\n");
302
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
303
- if output.ends_with(' ') {
304
- output.pop();
305
- }
306
- output.push_str("\n```\n\n");
307
- }
308
-
309
- HocrElementType::OcrDropcap => {
310
- output.push_str("**");
311
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
312
- if output.ends_with(' ') {
313
- output.pop();
314
- }
315
- output.push_str("**");
316
- }
317
-
318
- HocrElementType::OcrGlyph | HocrElementType::OcrGlyphs | HocrElementType::OcrCinfo => {
319
- append_text_and_children(element, output, depth, preserve_structure, enable_spatial_tables, ctx);
320
- }
321
-
322
- HocrElementType::OcrPage
323
- | HocrElementType::OcrCarea
324
- | HocrElementType::OcrDocument
325
- | HocrElementType::OcrLinear
326
- | HocrElementType::OcrxBlock
327
- | HocrElementType::OcrColumn
328
- | HocrElementType::OcrXycut => {
329
- let mut sorted_children: Vec<_> = element.children.iter().collect();
330
- if preserve_structure {
331
- sorted_children.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
332
- }
333
-
334
- let mut idx = 0;
335
- while idx < sorted_children.len() {
336
- let child = sorted_children[idx];
337
- if child.element_type == HocrElementType::OcrPar {
338
- if let Some((code_lines, consumed, language)) = collect_code_block(&sorted_children[idx..]) {
339
- if let Some(heading_text) =
340
- find_previous_heading(&sorted_children, idx).or_else(|| ctx.last_heading.clone())
341
- {
342
- ensure_heading_prefix(output, &heading_text);
343
- }
344
- emit_code_block(output, &code_lines, language);
345
- idx += consumed;
346
- continue;
347
- }
348
- }
349
-
350
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
351
- idx += 1;
352
- }
353
- }
354
-
355
- HocrElementType::OcrNoise => {}
356
- }
357
- }
358
-
359
/// Appends a fenced Markdown code block containing `lines` to `output`.
///
/// # Parameters
///
/// * `output` - the Markdown buffer to append to. If it is non-empty, it is first topped
///   up with just enough newlines to end in a blank line.
/// * `lines` - the code lines; each is written verbatim followed by `'\n'`.
/// * `language` - optional info string written directly after the opening fence.
///
/// The fence is three backticks unless a code line itself contains a run of three or
/// more backticks; in that case the fence is made one backtick longer than the longest
/// such run so the content cannot terminate the block early. The block is always
/// followed by a blank line.
pub fn emit_code_block(output: &mut String, lines: &[String], language: Option<&str>) {
    // Separate the block from preceding content, adding only the newlines that are missing.
    if !output.is_empty() && !output.ends_with("\n\n") {
        let missing = if output.ends_with('\n') { "\n" } else { "\n\n" };
        output.push_str(missing);
    }

    // Splitting on every non-backtick character leaves exactly the backtick runs; a
    // backtick is one byte, so each piece's byte length is its run length.
    let longest_backtick_run = lines
        .iter()
        .flat_map(|line| line.split(|c: char| c != '`'))
        .map(str::len)
        .max()
        .unwrap_or(0);
    // Never shorter than the conventional three backticks.
    let fence = "`".repeat(longest_backtick_run.max(2) + 1);

    output.push_str(&fence);
    if let Some(lang) = language {
        output.push_str(lang);
    }
    output.push('\n');

    for line in lines {
        output.push_str(line);
        output.push('\n');
    }

    output.push_str(&fence);
    output.push_str("\n\n");
}