html-to-markdown 2.30.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -14
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +5 -2
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,379 +0,0 @@
1
- #![allow(clippy::cast_possible_truncation)]
2
- //! Document hierarchy and code block detection for hOCR conversion
3
-
4
- use super::code_analysis::{is_bullet_like, is_code_paragraph, is_confident_code_block};
5
- use super::keywords::detect_code_language;
6
- use super::output::{ConvertContext, collect_line_words, element_text_content};
7
- use crate::hocr::types::{HocrElement, HocrElementType};
8
-
9
- pub fn append_text_and_children(
10
- element: &HocrElement,
11
- output: &mut String,
12
- depth: usize,
13
- preserve_structure: bool,
14
- enable_spatial_tables: bool,
15
- ctx: &mut ConvertContext,
16
- ) {
17
- use super::elements::convert_element;
18
-
19
- if !element.text.is_empty() {
20
- output.push_str(&element.text);
21
- if !element.children.is_empty() {
22
- output.push(' ');
23
- }
24
- }
25
-
26
- if preserve_structure && should_sort_children(&element.children) {
27
- let mut sorted_children: Vec<&HocrElement> = element.children.iter().collect();
28
- sorted_children.sort_by_key(|e| e.properties.order.unwrap_or(u32::MAX));
29
- for child in sorted_children {
30
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
31
- }
32
- } else {
33
- for child in &element.children {
34
- convert_element(child, output, depth + 1, preserve_structure, enable_spatial_tables, ctx);
35
- }
36
- }
37
- }
38
-
39
- fn should_sort_children(children: &[HocrElement]) -> bool {
40
- let mut last = 0u32;
41
- let mut saw_any = false;
42
-
43
- for child in children {
44
- let order = child.properties.order.unwrap_or(u32::MAX);
45
- if saw_any && order < last {
46
- return true;
47
- }
48
- last = order;
49
- saw_any = true;
50
- }
51
-
52
- false
53
- }
54
-
55
- pub fn detect_heading_paragraph(element: &HocrElement, text: &str) -> Option<String> {
56
- if element.element_type != HocrElementType::OcrPar {
57
- return None;
58
- }
59
-
60
- let line_children: Vec<&HocrElement> = element
61
- .children
62
- .iter()
63
- .filter(|child| matches!(child.element_type, HocrElementType::OcrLine | HocrElementType::OcrxLine))
64
- .collect();
65
-
66
- if line_children.len() != 1 {
67
- return None;
68
- }
69
-
70
- // Determine effective font size from child line elements.
71
- // First check x_fsize, then fall back to bbox height as a proxy.
72
- let font_size = line_children.iter().find_map(|child| {
73
- child
74
- .properties
75
- .x_fsize
76
- .or_else(|| child.properties.bbox.map(|b| b.height()))
77
- });
78
-
79
- let has_large_font = font_size.is_some_and(|size| size >= 14);
80
-
81
- let char_limit = if has_large_font { 80 } else { 60 };
82
-
83
- if text.is_empty() || text.len() > char_limit || text.contains(':') || text.contains('\n') {
84
- return None;
85
- }
86
-
87
- let mut word_count = 0usize;
88
- let mut uppercase_initial = 0usize;
89
- for word in text.split_whitespace() {
90
- word_count += 1;
91
- if word.chars().next().is_some_and(char::is_uppercase) {
92
- uppercase_initial += 1;
93
- }
94
- if word_count > 8 {
95
- return None;
96
- }
97
- }
98
-
99
- // Allow single-word headings when font size is large
100
- let min_words = if has_large_font { 1 } else { 2 };
101
- if word_count < min_words {
102
- return None;
103
- }
104
-
105
- if uppercase_initial < word_count.saturating_sub(1) {
106
- return None;
107
- }
108
-
109
- if text.ends_with('.') {
110
- return None;
111
- }
112
-
113
- Some(text.to_string())
114
- }
115
-
116
- pub fn find_previous_heading(children: &[&HocrElement], idx: usize) -> Option<String> {
117
- if idx == 0 {
118
- return None;
119
- }
120
-
121
- for candidate in children[..idx].iter().rev() {
122
- let text_snapshot = element_text_content(candidate);
123
- if let Some(text) = detect_heading_paragraph(candidate, &text_snapshot) {
124
- return Some(text);
125
- }
126
- }
127
-
128
- None
129
- }
130
-
131
- pub fn ensure_heading_prefix(output: &mut String, heading: &str) {
132
- let snippet = format!("# {heading}\n\n");
133
- if output.ends_with(&snippet) {
134
- return;
135
- }
136
-
137
- if !output.is_empty() && !output.ends_with("\n\n") {
138
- if output.ends_with('\n') {
139
- output.push('\n');
140
- } else {
141
- output.push_str("\n\n");
142
- }
143
- }
144
-
145
- output.push_str(&snippet);
146
- }
147
-
148
- #[derive(Clone)]
149
- pub struct CodeLineInfo {
150
- pub text: String,
151
- pub x1: u32,
152
- }
153
-
154
- pub fn collect_code_block(children: &[&HocrElement]) -> Option<(Vec<String>, usize, Option<&'static str>)> {
155
- let mut collected: Vec<CodeLineInfo> = Vec::new();
156
- let mut consumed = 0;
157
- let mut paragraph_count = 0;
158
-
159
- while consumed < children.len() {
160
- let child = children[consumed];
161
- if child.element_type != HocrElementType::OcrPar {
162
- break;
163
- }
164
-
165
- let lines = extract_code_lines(child);
166
- if lines.is_empty() || !is_code_paragraph(&lines) {
167
- break;
168
- }
169
-
170
- if paragraph_count > 0 && !collected.is_empty() && should_insert_code_paragraph_break(&collected, &lines) {
171
- let gap_x = lines
172
- .first()
173
- .map(|info| info.x1)
174
- .or_else(|| child.properties.bbox.map(|bbox| bbox.x1))
175
- .unwrap_or(0);
176
- collected.push(CodeLineInfo {
177
- text: String::new(),
178
- x1: gap_x,
179
- });
180
- }
181
-
182
- collected.extend(lines);
183
- consumed += 1;
184
- paragraph_count += 1;
185
- }
186
-
187
- if collected.is_empty() {
188
- return None;
189
- }
190
-
191
- if !is_confident_code_block(&collected) {
192
- return None;
193
- }
194
-
195
- let mut x_values: Vec<u32> = collected
196
- .iter()
197
- .filter(|info| !info.text.is_empty())
198
- .map(|info| info.x1)
199
- .collect();
200
-
201
- if x_values.is_empty() {
202
- x_values.push(0);
203
- }
204
-
205
- let min_x = *x_values.iter().min().unwrap_or(&0);
206
- let indent_candidates: Vec<u32> = x_values
207
- .iter()
208
- .filter_map(|&x| if x > min_x { Some(x - min_x) } else { None })
209
- .filter(|&delta| delta > 5)
210
- .collect();
211
-
212
- let mut indent_step = indent_candidates.iter().copied().min().unwrap_or(40);
213
-
214
- if indent_step == 0 {
215
- indent_step = 40;
216
- }
217
-
218
- let mut lines: Vec<String> = Vec::new();
219
- for info in collected {
220
- if info.text.is_empty() {
221
- if !lines.is_empty() && !lines.last().unwrap().is_empty() {
222
- lines.push(String::new());
223
- }
224
- continue;
225
- }
226
-
227
- let indent_level = if info.x1 <= min_x {
228
- 0
229
- } else {
230
- let diff = info.x1 - min_x;
231
- (((diff as f32) / indent_step as f32) + 0.25).floor() as usize
232
- }
233
- .min(6);
234
-
235
- let mut normalized = normalize_code_line(&info.text);
236
- if indent_level > 0 {
237
- let indent = " ".repeat(indent_level);
238
- normalized = format!("{indent}{normalized}");
239
- }
240
- lines.push(normalized);
241
- }
242
-
243
- while matches!(lines.last(), Some(last) if last.is_empty()) {
244
- lines.pop();
245
- }
246
-
247
- let meaningful_lines: Vec<&String> = lines.iter().filter(|line| !line.trim().is_empty()).collect();
248
- let meaningful_count = meaningful_lines.len();
249
- if meaningful_count < 3 {
250
- return None;
251
- }
252
-
253
- let bullet_like = meaningful_lines.iter().filter(|line| is_bullet_like(line)).count();
254
- if bullet_like * 2 >= meaningful_count {
255
- return None;
256
- }
257
-
258
- let language = detect_code_language(&lines);
259
- Some((lines, consumed, language))
260
- }
261
-
262
- fn extract_code_lines(paragraph: &HocrElement) -> Vec<CodeLineInfo> {
263
- let mut lines = Vec::new();
264
-
265
- for child in &paragraph.children {
266
- match child.element_type {
267
- HocrElementType::OcrLine | HocrElementType::OcrxLine => {
268
- let mut words = Vec::new();
269
- collect_line_words(child, &mut words);
270
- if words.is_empty() {
271
- continue;
272
- }
273
- let text = words.join(" ");
274
- if text.trim().is_empty() {
275
- continue;
276
- }
277
- let x1 = child
278
- .properties
279
- .bbox
280
- .map(|bbox| bbox.x1)
281
- .or_else(|| paragraph.properties.bbox.map(|bbox| bbox.x1))
282
- .unwrap_or(0);
283
- lines.push(CodeLineInfo {
284
- text: text.trim().to_string(),
285
- x1,
286
- });
287
- }
288
- _ => {}
289
- }
290
- }
291
-
292
- if lines.is_empty() {
293
- let mut words = Vec::new();
294
- collect_line_words(paragraph, &mut words);
295
- if !words.is_empty() {
296
- let x1 = paragraph.properties.bbox.map_or(0, |bbox| bbox.x1);
297
- lines.push(CodeLineInfo {
298
- text: words.join(" ").trim().to_string(),
299
- x1,
300
- });
301
- }
302
- }
303
-
304
- lines
305
- }
306
-
307
- fn should_insert_code_paragraph_break(previous: &[CodeLineInfo], next: &[CodeLineInfo]) -> bool {
308
- let prev_line = previous.iter().rev().find(|info| !info.text.trim().is_empty());
309
- let next_line = next.iter().find(|info| !info.text.trim().is_empty());
310
-
311
- match (prev_line, next_line) {
312
- (Some(prev), Some(next)) => {
313
- let prev_text = prev.text.trim();
314
- let next_text = next.text.trim();
315
-
316
- if next_text == "}" {
317
- return false;
318
- }
319
-
320
- if prev_text.ends_with('{') && next_text == "}" {
321
- return false;
322
- }
323
-
324
- true
325
- }
326
- _ => false,
327
- }
328
- }
329
-
330
- fn normalize_code_line(text: &str) -> String {
331
- let mut normalized = text.trim().to_string();
332
- let replacements = [("\u{2014}", "-"), ("\u{2013}", "-"), ("\u{2212}", "-")];
333
- for (from, to) in replacements {
334
- normalized = normalized.replace(from, to);
335
- }
336
-
337
- normalized = normalized.replace('+', " + ");
338
-
339
- let mut collapsed = String::new();
340
- let mut last_space = false;
341
- for ch in normalized.chars() {
342
- if ch.is_whitespace() {
343
- if !last_space {
344
- collapsed.push(' ');
345
- last_space = true;
346
- }
347
- } else {
348
- collapsed.push(ch);
349
- last_space = false;
350
- }
351
- }
352
- let mut cleaned = collapsed.trim().to_string();
353
- let punctuation_fixes = [(" ,", ","), (" ;", ";"), (" )", ")"), ("( ", "(")];
354
- for (from, to) in punctuation_fixes {
355
- cleaned = cleaned.replace(from, to);
356
- }
357
- let mut final_line = String::new();
358
- for ch in cleaned.chars() {
359
- match ch {
360
- '{' => {
361
- if !final_line.ends_with(' ') && !final_line.is_empty() {
362
- final_line.push(' ');
363
- }
364
- final_line.push('{');
365
- }
366
- '}' | ';' => {
367
- if final_line.ends_with(' ') {
368
- final_line.pop();
369
- }
370
- final_line.push(ch);
371
- }
372
- _ => final_line.push(ch),
373
- }
374
- }
375
- while final_line.contains(" ") {
376
- final_line = final_line.replace(" ", " ");
377
- }
378
- final_line.trim().to_string()
379
- }
@@ -1,55 +0,0 @@
1
- //! Keyword and language detection for code block identification
2
-
3
- /// Detect if a text line appears to be shell command prompt
4
- pub fn is_shell_prompt(text: &str) -> bool {
5
- let trimmed = text.trim_start();
6
- if trimmed.is_empty() {
7
- return false;
8
- }
9
-
10
- trimmed.starts_with('$')
11
- || trimmed.starts_with('#')
12
- || trimmed.contains("]#")
13
- || trimmed.starts_with("sudo ")
14
- || trimmed.starts_with("./")
15
- || trimmed.starts_with("python ")
16
- || trimmed.starts_with("pip ")
17
- || trimmed.starts_with("uv ")
18
- }
19
-
20
- /// Check if a keyword appears at the start with proper word boundaries
21
- pub fn starts_with_keyword(trimmed: &str, keyword: &str) -> bool {
22
- if !trimmed.starts_with(keyword) {
23
- return false;
24
- }
25
- if let Some(first) = trimmed.chars().next() {
26
- if !first.is_ascii_lowercase() {
27
- return false;
28
- }
29
- }
30
- match trimmed.chars().nth(keyword.len()) {
31
- None => true,
32
- Some(ch) => ch.is_whitespace() || matches!(ch, '(' | ':' | '{' | '[' | '.'),
33
- }
34
- }
35
-
36
- /// Check if a keyword token appears anywhere in text
37
- pub fn contains_keyword_token(text: &str, keyword: &str) -> bool {
38
- text.split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
39
- .any(|token| token == keyword)
40
- }
41
-
42
- /// Detect programming language based on code patterns
43
- pub fn detect_code_language(lines: &[String]) -> Option<&'static str> {
44
- let lower_lines: Vec<String> = lines.iter().map(|line| line.to_lowercase()).collect();
45
- if lower_lines.iter().any(|line| line.contains("function"))
46
- || lower_lines.iter().any(|line| line.contains("console."))
47
- || lower_lines.iter().any(|line| line.contains("const "))
48
- {
49
- return Some("javascript");
50
- }
51
- if lower_lines.iter().any(|line| line.contains("printf")) {
52
- return Some("c");
53
- }
54
- None
55
- }