kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265):
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,4 @@
1
+ pub mod tsv_parser;
2
+
3
+ pub use html_to_markdown_rs::hocr::{HocrWord, reconstruct_table, table_to_markdown};
4
+ pub use tsv_parser::extract_words_from_tsv;
@@ -0,0 +1,144 @@
1
+ use super::super::error::OcrError;
2
+ use super::super::utils::{TSV_MIN_FIELDS, TSV_WORD_LEVEL};
3
+ use html_to_markdown_rs::hocr::HocrWord;
4
+
5
+ /// Extract words from Tesseract TSV output and convert to HocrWord format
6
+ ///
7
+ /// This parses Tesseract's TSV format (level, page_num, block_num, ...) and
8
+ /// converts it to the HocrWord format used by html-to-markdown-rs for table reconstruction.
9
+ pub fn extract_words_from_tsv(tsv_data: &str, min_confidence: f64) -> Result<Vec<HocrWord>, OcrError> {
10
+ let mut words = Vec::new();
11
+
12
+ for (line_num, line) in tsv_data.lines().enumerate() {
13
+ if line_num == 0 {
14
+ continue;
15
+ }
16
+
17
+ let line = line.trim();
18
+ if line.is_empty() {
19
+ continue;
20
+ }
21
+
22
+ let fields: Vec<&str> = line.split('\t').collect();
23
+ if fields.len() < TSV_MIN_FIELDS {
24
+ continue;
25
+ }
26
+
27
+ let level = fields[0].parse::<u32>().unwrap_or(0);
28
+ if level != TSV_WORD_LEVEL {
29
+ continue;
30
+ }
31
+
32
+ let conf = fields[10].parse::<f64>().unwrap_or(-1.0);
33
+ if conf < min_confidence {
34
+ continue;
35
+ }
36
+
37
+ let text = fields[11].trim();
38
+ if text.is_empty() {
39
+ continue;
40
+ }
41
+
42
+ let word = HocrWord {
43
+ text: text.to_string(),
44
+ left: fields[6].parse().unwrap_or(0),
45
+ top: fields[7].parse().unwrap_or(0),
46
+ width: fields[8].parse().unwrap_or(0),
47
+ height: fields[9].parse().unwrap_or(0),
48
+ confidence: conf,
49
+ };
50
+
51
+ words.push(word);
52
+ }
53
+
54
+ Ok(words)
55
+ }
56
+
57
#[cfg(test)]
mod tests {
    use super::*;

    /// Full Tesseract TSV column header, written space-separated for readability.
    const HEADER: &str = "level page_num block_num par_num line_num word_num left top width height conf text";

    /// Build a TSV fixture from rows whose cells are written space-separated.
    ///
    /// Every space becomes a tab and rows are joined with newlines, so the
    /// separators the parser relies on are explicit instead of depending on
    /// invisible whitespace inside a raw string literal.
    fn tsv(rows: &[&str]) -> String {
        rows.iter()
            .map(|row| row.replace(' ', "\t"))
            .collect::<Vec<_>>()
            .join("\n")
    }

    #[test]
    fn test_extract_words_basic() {
        let data = tsv(&[
            HEADER,
            "5 1 0 0 0 0 100 50 80 30 95.5 Hello",
            "5 1 0 0 0 1 190 50 70 30 92.3 World",
        ]);

        let words = extract_words_from_tsv(&data, 0.0).unwrap();
        assert_eq!(words.len(), 2);

        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].left, 100);
        assert_eq!(words[0].top, 50);
        assert_eq!(words[0].confidence, 95.5);

        assert_eq!(words[1].text, "World");
        assert_eq!(words[1].left, 190);
    }

    #[test]
    fn test_extract_words_confidence_filter() {
        let data = tsv(&[
            HEADER,
            "5 1 0 0 0 0 100 50 80 30 95.5 Hello",
            "5 1 0 0 0 1 190 50 70 30 50.0 World",
            "5 1 0 0 0 2 270 50 60 30 92.3 Test",
        ]);

        // Only rows with confidence >= 90.0 survive.
        let words = extract_words_from_tsv(&data, 90.0).unwrap();
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "Test");
    }

    #[test]
    fn test_extract_words_level_filter() {
        let data = tsv(&[
            HEADER,
            "3 1 0 0 0 0 100 50 80 30 95.5 Paragraph",
            "5 1 0 0 0 0 100 50 80 30 95.5 Hello",
            "4 1 0 0 0 1 190 50 70 30 92.3 Line",
        ]);

        // Only the level-5 row is expected to be returned.
        let words = extract_words_from_tsv(&data, 0.0).unwrap();
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "Hello");
    }

    #[test]
    fn test_hocr_word_methods() {
        let word = HocrWord {
            text: "Hello".to_string(),
            left: 100,
            top: 50,
            width: 80,
            height: 30,
            confidence: 95.5,
        };

        assert_eq!(word.right(), 180);
        assert_eq!(word.bottom(), 80);
        assert_eq!(word.y_center(), 65.0);
        assert_eq!(word.x_center(), 140.0);
    }

    #[test]
    fn test_extract_words_empty_text() {
        let data = tsv(&[
            HEADER,
            // The trailing space becomes a trailing tab, i.e. an empty text column.
            "5 1 0 0 0 0 100 50 80 30 95.5 ",
            "5 1 0 0 0 1 190 50 70 30 92.3 World",
        ]);

        let words = extract_words_from_tsv(&data, 0.0).unwrap();
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "World");
    }

    #[test]
    fn test_extract_words_malformed() {
        let data = tsv(&[
            // A truncated header is still skipped as the first line.
            "level page_num block_num",
            "5 1 0 0 0 0 100 50 80 30 95.5 Hello",
            "invalid line",
            "5 1 0 0 0 1 190 50 70 30 92.3 World",
        ]);

        let words = extract_words_from_tsv(&data, 0.0).unwrap();
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "World");
    }
}
@@ -0,0 +1,450 @@
1
+ //! Native Tesseract OCR backend.
2
+ //!
3
+ //! This module provides the native Tesseract backend that implements the OcrBackend
4
+ //! trait, bridging the plugin system with the low-level OcrProcessor.
5
+
6
+ use crate::Result;
7
+ use crate::core::config::OcrConfig;
8
+ use crate::ocr::processor::OcrProcessor;
9
+ use crate::plugins::{OcrBackend, OcrBackendType, Plugin};
10
+ use crate::types::ExtractionResult;
11
+ use async_trait::async_trait;
12
+ use std::path::Path;
13
+ use std::sync::Arc;
14
+
15
+ use crate::ocr::types::TesseractConfig as InternalTesseractConfig;
16
+
17
+ /// Native Tesseract OCR backend.
18
+ ///
19
+ /// This backend wraps the OcrProcessor and implements the OcrBackend trait,
20
+ /// allowing it to be used through the plugin system.
21
+ ///
22
+ /// # Thread Safety
23
+ ///
24
+ /// Uses Arc for shared ownership and is thread-safe (Send + Sync).
25
+ pub struct TesseractBackend {
26
+ processor: Arc<OcrProcessor>,
27
+ }
28
+
29
+ impl TesseractBackend {
30
+ /// Create a new Tesseract backend with default cache directory.
31
+ pub fn new() -> Result<Self> {
32
+ let processor = OcrProcessor::new(None).map_err(|e| crate::KreuzbergError::Ocr {
33
+ message: format!("Failed to create Tesseract processor: {}", e),
34
+ source: Some(Box::new(e)),
35
+ })?;
36
+ Ok(Self {
37
+ processor: Arc::new(processor),
38
+ })
39
+ }
40
+
41
+ /// Create a new Tesseract backend with custom cache directory.
42
+ pub fn with_cache_dir(cache_dir: std::path::PathBuf) -> Result<Self> {
43
+ let processor = OcrProcessor::new(Some(cache_dir)).map_err(|e| crate::KreuzbergError::Ocr {
44
+ message: format!("Failed to create Tesseract processor: {}", e),
45
+ source: Some(Box::new(e)),
46
+ })?;
47
+ Ok(Self {
48
+ processor: Arc::new(processor),
49
+ })
50
+ }
51
+
52
+ /// Convert public API TesseractConfig to internal TesseractConfig.
53
+ ///
54
+ /// The public API types (crate::types) use i32 for compatibility with PyO3,
55
+ /// while internal types (crate::ocr::types) use u8/u32 for efficiency.
56
+ fn convert_config(public_config: &crate::types::TesseractConfig) -> InternalTesseractConfig {
57
+ InternalTesseractConfig {
58
+ language: public_config.language.clone(),
59
+ psm: public_config.psm as u8,
60
+ output_format: public_config.output_format.clone(),
61
+ oem: public_config.oem as u8,
62
+ min_confidence: public_config.min_confidence,
63
+ preprocessing: public_config.preprocessing.clone(),
64
+ enable_table_detection: public_config.enable_table_detection,
65
+ table_min_confidence: public_config.table_min_confidence,
66
+ table_column_threshold: public_config.table_column_threshold as u32,
67
+ table_row_threshold_ratio: public_config.table_row_threshold_ratio,
68
+ use_cache: public_config.use_cache,
69
+ classify_use_pre_adapted_templates: public_config.classify_use_pre_adapted_templates,
70
+ language_model_ngram_on: public_config.language_model_ngram_on,
71
+ tessedit_dont_blkrej_good_wds: public_config.tessedit_dont_blkrej_good_wds,
72
+ tessedit_dont_rowrej_good_wds: public_config.tessedit_dont_rowrej_good_wds,
73
+ tessedit_enable_dict_correction: public_config.tessedit_enable_dict_correction,
74
+ tessedit_char_whitelist: public_config.tessedit_char_whitelist.clone(),
75
+ tessedit_char_blacklist: public_config.tessedit_char_blacklist.clone(),
76
+ tessedit_use_primary_params_model: public_config.tessedit_use_primary_params_model,
77
+ textord_space_size_is_variable: public_config.textord_space_size_is_variable,
78
+ thresholding_method: public_config.thresholding_method,
79
+ }
80
+ }
81
+
82
+ /// Convert OcrConfig to internal TesseractConfig.
83
+ ///
84
+ /// Uses tesseract_config from OcrConfig if provided, otherwise uses defaults
85
+ /// with the language from OcrConfig.
86
+ fn config_to_tesseract(&self, config: &OcrConfig) -> InternalTesseractConfig {
87
+ match &config.tesseract_config {
88
+ Some(tess_config) => Self::convert_config(tess_config),
89
+ None => InternalTesseractConfig {
90
+ language: config.language.clone(),
91
+ ..Default::default()
92
+ },
93
+ }
94
+ }
95
+ }
96
+
97
+ impl Default for TesseractBackend {
98
+ fn default() -> Self {
99
+ Self::new().unwrap()
100
+ }
101
+ }
102
+
103
+ impl Plugin for TesseractBackend {
104
+ fn name(&self) -> &str {
105
+ "tesseract"
106
+ }
107
+
108
+ fn version(&self) -> String {
109
+ kreuzberg_tesseract::TesseractAPI::version()
110
+ }
111
+
112
+ fn initialize(&self) -> Result<()> {
113
+ Ok(())
114
+ }
115
+
116
+ fn shutdown(&self) -> Result<()> {
117
+ self.processor.clear_cache().map_err(|e| crate::KreuzbergError::Plugin {
118
+ message: format!("Failed to clear Tesseract cache: {}", e),
119
+ plugin_name: "tesseract".to_string(),
120
+ })
121
+ }
122
+ }
123
+
124
+ #[async_trait]
125
+ impl OcrBackend for TesseractBackend {
126
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
127
+ let tess_config = self.config_to_tesseract(config);
128
+ let tess_config_clone = tess_config.clone();
129
+
130
+ let processor = Arc::clone(&self.processor);
131
+ let image_bytes = image_bytes.to_vec();
132
+
133
+ let ocr_result = tokio::task::spawn_blocking(move || processor.process_image(&image_bytes, &tess_config_clone))
134
+ .await
135
+ .map_err(|e| crate::KreuzbergError::Plugin {
136
+ message: format!("Tesseract task panicked: {}", e),
137
+ plugin_name: "tesseract".to_string(),
138
+ })?
139
+ .map_err(|e| crate::KreuzbergError::Ocr {
140
+ message: format!("Tesseract OCR failed: {}", e),
141
+ source: Some(Box::new(e)),
142
+ })?;
143
+
144
+ let metadata = crate::types::Metadata {
145
+ format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
146
+ language: tess_config.language.clone(),
147
+ psm: tess_config.psm as i32,
148
+ output_format: tess_config.output_format.clone(),
149
+ table_count: ocr_result.tables.len(),
150
+ table_rows: ocr_result.tables.first().map(|t| t.cells.len()),
151
+ table_cols: ocr_result
152
+ .tables
153
+ .first()
154
+ .and_then(|t| t.cells.first().map(|row| row.len())),
155
+ })),
156
+ additional: ocr_result.metadata,
157
+ ..Default::default()
158
+ };
159
+
160
+ Ok(ExtractionResult {
161
+ content: ocr_result.content,
162
+ mime_type: ocr_result.mime_type,
163
+ metadata,
164
+ tables: ocr_result
165
+ .tables
166
+ .into_iter()
167
+ .map(|t| crate::types::Table {
168
+ cells: t.cells,
169
+ markdown: t.markdown,
170
+ page_number: t.page_number,
171
+ })
172
+ .collect(),
173
+ detected_languages: None,
174
+ chunks: None,
175
+ images: None,
176
+ })
177
+ }
178
+
179
+ async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
180
+ let tess_config = self.config_to_tesseract(config);
181
+ let tess_config_clone = tess_config.clone();
182
+
183
+ let processor = Arc::clone(&self.processor);
184
+ let path_str = path.to_string_lossy().to_string();
185
+
186
+ let ocr_result = tokio::task::spawn_blocking(move || processor.process_file(&path_str, &tess_config_clone))
187
+ .await
188
+ .map_err(|e| crate::KreuzbergError::Plugin {
189
+ message: format!("Tesseract task panicked: {}", e),
190
+ plugin_name: "tesseract".to_string(),
191
+ })?
192
+ .map_err(|e| crate::KreuzbergError::Ocr {
193
+ message: format!("Tesseract OCR failed: {}", e),
194
+ source: Some(Box::new(e)),
195
+ })?;
196
+
197
+ let metadata = crate::types::Metadata {
198
+ format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
199
+ language: tess_config.language.clone(),
200
+ psm: tess_config.psm as i32,
201
+ output_format: tess_config.output_format.clone(),
202
+ table_count: ocr_result.tables.len(),
203
+ table_rows: ocr_result.tables.first().map(|t| t.cells.len()),
204
+ table_cols: ocr_result
205
+ .tables
206
+ .first()
207
+ .and_then(|t| t.cells.first().map(|row| row.len())),
208
+ })),
209
+ additional: ocr_result.metadata,
210
+ ..Default::default()
211
+ };
212
+
213
+ Ok(ExtractionResult {
214
+ content: ocr_result.content,
215
+ mime_type: ocr_result.mime_type,
216
+ metadata,
217
+ tables: ocr_result
218
+ .tables
219
+ .into_iter()
220
+ .map(|t| crate::types::Table {
221
+ cells: t.cells,
222
+ markdown: t.markdown,
223
+ page_number: t.page_number,
224
+ })
225
+ .collect(),
226
+ detected_languages: None,
227
+ chunks: None,
228
+ images: None,
229
+ })
230
+ }
231
+
232
+ fn supports_language(&self, lang: &str) -> bool {
233
+ // TODO: Query Tesseract for available languages
234
+ matches!(
235
+ lang,
236
+ "eng"
237
+ | "deu"
238
+ | "fra"
239
+ | "spa"
240
+ | "ita"
241
+ | "por"
242
+ | "rus"
243
+ | "chi_sim"
244
+ | "chi_tra"
245
+ | "jpn"
246
+ | "kor"
247
+ | "ara"
248
+ | "hin"
249
+ | "ben"
250
+ | "tha"
251
+ | "vie"
252
+ | "heb"
253
+ | "tur"
254
+ | "pol"
255
+ | "nld"
256
+ | "swe"
257
+ | "dan"
258
+ | "fin"
259
+ | "nor"
260
+ | "ces"
261
+ | "hun"
262
+ | "ron"
263
+ | "ukr"
264
+ | "bul"
265
+ | "hrv"
266
+ | "srp"
267
+ | "slk"
268
+ | "slv"
269
+ | "lit"
270
+ | "lav"
271
+ | "est"
272
+ )
273
+ }
274
+
275
+ fn backend_type(&self) -> OcrBackendType {
276
+ OcrBackendType::Tesseract
277
+ }
278
+
279
+ fn supported_languages(&self) -> Vec<String> {
280
+ // TODO: Query Tesseract API for available languages dynamically
281
+ vec![
282
+ "eng", "deu", "fra", "spa", "ita", "por", "rus", "chi_sim", "chi_tra", "jpn", "kor", "ara", "hin", "ben",
283
+ "tha", "vie", "heb", "tur", "pol", "nld", "swe", "dan", "fin", "nor", "ces", "hun", "ron", "ukr", "bul",
284
+ "hrv", "srp", "slk", "slv", "lit", "lav", "est",
285
+ ]
286
+ .into_iter()
287
+ .map(String::from)
288
+ .collect()
289
+ }
290
+
291
+ fn supports_table_detection(&self) -> bool {
292
+ true
293
+ }
294
+ }
295
+
296
#[cfg(test)]
mod tests {
    use super::*;

    /// Backend with the default cache directory; panics if creation fails.
    fn new_backend() -> TesseractBackend {
        TesseractBackend::new().unwrap()
    }

    /// `OcrConfig` targeting the "tesseract" backend with the given language
    /// and optional Tesseract-specific configuration.
    fn tesseract_ocr_config(language: &str, tesseract_config: Option<crate::types::TesseractConfig>) -> OcrConfig {
        OcrConfig {
            backend: "tesseract".to_string(),
            language: language.to_string(),
            tesseract_config,
        }
    }

    #[test]
    fn test_tesseract_backend_creation() {
        assert!(TesseractBackend::new().is_ok());
    }

    #[test]
    fn test_tesseract_backend_plugin_interface() {
        let backend = new_backend();
        assert_eq!(backend.name(), "tesseract");
        assert!(!backend.version().is_empty());
        assert!(backend.initialize().is_ok());
    }

    #[test]
    fn test_tesseract_backend_type() {
        assert_eq!(new_backend().backend_type(), OcrBackendType::Tesseract);
    }

    #[test]
    fn test_tesseract_backend_supports_language() {
        let backend = new_backend();
        for supported in ["eng", "deu", "fra"] {
            assert!(backend.supports_language(supported));
        }
        assert!(!backend.supports_language("xyz"));
    }

    #[test]
    fn test_tesseract_backend_supports_table_detection() {
        assert!(new_backend().supports_table_detection());
    }

    #[test]
    fn test_tesseract_backend_supported_languages() {
        let languages = new_backend().supported_languages();
        assert!(languages.iter().any(|code| code == "eng"));
        assert!(languages.iter().any(|code| code == "deu"));
        assert!(languages.len() > 30);
    }

    #[test]
    fn test_config_to_tesseract_with_none() {
        // Without a Tesseract-specific config, only the language is taken from OcrConfig.
        let ocr_config = tesseract_ocr_config("deu", None);

        let tess_config = new_backend().config_to_tesseract(&ocr_config);
        assert_eq!(tess_config.language, "deu");
        assert_eq!(tess_config.psm, InternalTesseractConfig::default().psm);
    }

    #[test]
    fn test_config_to_tesseract_with_some() {
        let custom_tess_config = crate::types::TesseractConfig {
            language: "fra".to_string(),
            psm: 6,
            enable_table_detection: true,
            ..Default::default()
        };
        // The nested config's language ("fra") wins over the outer one ("eng").
        let ocr_config = tesseract_ocr_config("eng", Some(custom_tess_config));

        let tess_config = new_backend().config_to_tesseract(&ocr_config);
        assert_eq!(tess_config.language, "fra");
        assert_eq!(tess_config.psm, 6);
        assert!(tess_config.enable_table_detection);
    }

    #[test]
    fn test_tesseract_backend_default() {
        assert_eq!(TesseractBackend::default().name(), "tesseract");
    }

    #[test]
    fn test_config_conversion_with_new_fields() {
        let custom_tess_config = crate::types::TesseractConfig {
            language: "eng".to_string(),
            psm: 6,
            output_format: "markdown".to_string(),
            oem: 1,
            min_confidence: 80.0,
            preprocessing: Some(crate::types::ImagePreprocessingConfig {
                target_dpi: 600,
                auto_rotate: false,
                deskew: true,
                denoise: true,
                contrast_enhance: true,
                binarization_method: "adaptive".to_string(),
                invert_colors: false,
            }),
            tessedit_char_blacklist: "!@#$".to_string(),
            ..Default::default()
        };
        let ocr_config = tesseract_ocr_config("eng", Some(custom_tess_config));

        let tess_config = new_backend().config_to_tesseract(&ocr_config);

        assert_eq!(tess_config.oem, 1);
        assert_eq!(tess_config.min_confidence, 80.0);
        assert_eq!(tess_config.tessedit_char_blacklist, "!@#$");

        // The preprocessing settings must be carried over unchanged.
        assert!(tess_config.preprocessing.is_some());
        let preproc = tess_config.preprocessing.unwrap();
        assert_eq!(preproc.target_dpi, 600);
        assert!(!preproc.auto_rotate);
        assert!(preproc.deskew);
        assert!(preproc.denoise);
        assert!(preproc.contrast_enhance);
        assert_eq!(preproc.binarization_method, "adaptive");
        assert!(!preproc.invert_colors);
    }

    #[test]
    fn test_convert_config_type_conversions() {
        let public_config = crate::types::TesseractConfig {
            language: "eng".to_string(),
            psm: 6,
            oem: 3,
            table_column_threshold: 100,
            ..Default::default()
        };

        let internal_config = TesseractBackend::convert_config(&public_config);

        // Compared against explicitly typed literals (u8 / u32).
        assert_eq!(internal_config.psm, 6u8);
        assert_eq!(internal_config.oem, 3u8);
        assert_eq!(internal_config.table_column_threshold, 100u32);
    }
}