kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,420 @@
1
+ //! PDF table extraction using pdfium character positions.
2
+ //!
3
+ //! This module converts pdfium character data to HocrWord format,
4
+ //! allowing us to reuse the existing table reconstruction logic.
5
+
6
+ use super::error::{PdfError, Result};
7
+ use crate::ocr::table::HocrWord;
8
+ use pdfium_render::prelude::*;
9
+
10
+ /// Spacing threshold for word boundary detection (in PDF units).
11
+ ///
12
+ /// Characters separated by more than this distance are considered separate words.
13
+ const WORD_SPACING_THRESHOLD: f32 = 3.0;
14
+
15
+ /// Minimum word length for table detection (filter out noise).
16
+ const MIN_WORD_LENGTH: usize = 1;
17
+
18
+ /// Extract words with positions from PDF page for table detection.
19
+ ///
20
+ /// Groups adjacent characters into words based on spacing heuristics,
21
+ /// then converts to HocrWord format for table reconstruction.
22
+ ///
23
+ /// # Arguments
24
+ ///
25
+ /// * `page` - PDF page to extract words from
26
+ /// * `min_confidence` - Minimum confidence threshold (0.0-100.0). PDF text has high confidence (95.0).
27
+ ///
28
+ /// # Returns
29
+ ///
30
+ /// Vector of HocrWord objects with text and bounding box information.
31
+ ///
32
+ /// # Example
33
+ ///
34
+ /// ```rust,no_run
35
+ /// use kreuzberg::pdf::table::extract_words_from_page;
36
+ /// use pdfium_render::prelude::*;
37
+ ///
38
+ /// let pdfium = Pdfium::default();
39
+ /// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
40
+ /// let page = document.pages().get(0)?;
41
+ /// let words = extract_words_from_page(&page, 90.0)?;
42
+ /// ```
43
+ pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
44
+ // Get page dimensions for coordinate system
45
+ let page_width = page.width().value as i32;
46
+ let page_height = page.height().value as i32;
47
+
48
+ // Get all text from page
49
+ let page_text = page
50
+ .text()
51
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
52
+
53
+ // Extract character-level information
54
+ let chars = page_text.chars();
55
+
56
+ // Group characters into words based on spacing
57
+ let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
58
+
59
+ Ok(words)
60
+ }
61
+
62
/// A single character read from the PDF text layer, with its bounding box.
///
/// Coordinates are in PDF units with a bottom-left origin, exactly as taken
/// from the character bounds.
#[derive(Debug, Clone)]
struct CharInfo {
    /// The character itself.
    text: char,
    /// Left edge of the character's bounding box.
    x: f32,
    /// Bottom edge of the bounding box (PDF coordinates: bottom-left origin).
    y: f32,
    /// Horizontal extent of the bounding box.
    width: f32,
    /// Vertical extent of the bounding box.
    height: f32,
}
71
+
72
+ /// Group PDF characters into words based on spacing heuristics.
73
+ ///
74
+ /// Characters are grouped into the same word if they are:
75
+ /// 1. On the same horizontal line (similar y-coordinate)
76
+ /// 2. Close together horizontally (spacing < WORD_SPACING_THRESHOLD)
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `chars` - Iterator of PDF page characters
81
+ /// * `page_width` - Page width in PDF units
82
+ /// * `page_height` - Page height in PDF units
83
+ /// * `min_confidence` - Minimum confidence threshold (PDF text uses 95.0)
84
+ fn group_chars_into_words(
85
+ chars: PdfPageTextChars,
86
+ _page_width: i32,
87
+ page_height: i32,
88
+ min_confidence: f64,
89
+ ) -> Result<Vec<HocrWord>> {
90
+ let mut words: Vec<HocrWord> = Vec::new();
91
+ let mut current_word_chars: Vec<CharInfo> = Vec::new();
92
+
93
+ for pdf_char in chars.iter() {
94
+ // Get character bounds (use loose_bounds for table detection)
95
+ let bounds = pdf_char
96
+ .loose_bounds()
97
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
98
+
99
+ // Get unicode character (skip if invalid)
100
+ let Some(ch) = pdf_char.unicode_char() else {
101
+ continue;
102
+ };
103
+
104
+ // Extract character information
105
+ let char_info = CharInfo {
106
+ text: ch,
107
+ x: bounds.left().value,
108
+ y: bounds.bottom().value, // PDF coordinates: bottom-left origin
109
+ width: bounds.width().value,
110
+ height: bounds.height().value,
111
+ };
112
+
113
+ // Skip whitespace characters (they're used for word boundaries)
114
+ if char_info.text.is_whitespace() {
115
+ if !current_word_chars.is_empty() {
116
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
117
+ words.push(word);
118
+ }
119
+ current_word_chars.clear();
120
+ }
121
+ continue;
122
+ }
123
+
124
+ // Check if this character should start a new word
125
+ if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
126
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
127
+ words.push(word);
128
+ }
129
+ current_word_chars.clear();
130
+ }
131
+
132
+ current_word_chars.push(char_info);
133
+ }
134
+
135
+ // Finalize last word
136
+ if !current_word_chars.is_empty() {
137
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
138
+ words.push(word);
139
+ }
140
+ }
141
+
142
+ Ok(words)
143
+ }
144
+
145
+ /// Determine if a new character should start a new word.
146
+ ///
147
+ /// Returns true if the character is far from the previous character
148
+ /// (indicating a word boundary) or on a different line.
149
+ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -> bool {
150
+ if current_word_chars.is_empty() {
151
+ return false;
152
+ }
153
+
154
+ let last_char = &current_word_chars[current_word_chars.len() - 1];
155
+
156
+ // Check vertical distance (different lines)
157
+ let vertical_distance = (new_char.y - last_char.y).abs();
158
+ if vertical_distance > last_char.height * 0.5 {
159
+ return true;
160
+ }
161
+
162
+ // Check horizontal distance (word spacing)
163
+ let horizontal_gap = new_char.x - (last_char.x + last_char.width);
164
+ horizontal_gap > WORD_SPACING_THRESHOLD
165
+ }
166
+
167
+ /// Convert a group of characters into a HocrWord.
168
+ ///
169
+ /// Calculates bounding box and confidence for the word.
170
+ /// Returns None if the word doesn't meet minimum criteria.
171
+ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> Option<HocrWord> {
172
+ if chars.is_empty() {
173
+ return None;
174
+ }
175
+
176
+ // Build word text
177
+ let text: String = chars.iter().map(|c| c.text).collect();
178
+
179
+ if text.len() < MIN_WORD_LENGTH {
180
+ return None;
181
+ }
182
+
183
+ // Calculate bounding box (encompassing all characters)
184
+ let left = chars
185
+ .iter()
186
+ .map(|c| c.x)
187
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
188
+ .unwrap_or(0.0);
189
+ let right = chars
190
+ .iter()
191
+ .map(|c| c.x + c.width)
192
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
193
+ .unwrap_or(0.0);
194
+ let bottom = chars
195
+ .iter()
196
+ .map(|c| c.y)
197
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
198
+ .unwrap_or(0.0);
199
+ let top = chars
200
+ .iter()
201
+ .map(|c| c.y + c.height)
202
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
203
+ .unwrap_or(0.0);
204
+
205
+ let width = (right - left).round() as i32;
206
+ let height = (top - bottom).round() as i32;
207
+
208
+ // Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
209
+ // HocrWord expects top-left origin like images/OCR output
210
+ let top_in_image_coords = (page_height as f32 - top).round() as i32;
211
+
212
+ // PDF text has high confidence (no OCR uncertainty)
213
+ let confidence = 95.0;
214
+
215
+ // Apply confidence threshold
216
+ if confidence < min_confidence {
217
+ return None;
218
+ }
219
+
220
+ Some(HocrWord {
221
+ text,
222
+ left: left.round().max(0.0) as u32,
223
+ top: top_in_image_coords.max(0) as u32,
224
+ width: width.max(0) as u32,
225
+ height: height.max(0) as u32,
226
+ confidence,
227
+ })
228
+ }
229
+
230
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor so each test spells out only the geometry it needs.
    fn glyph(text: char, x: f32, y: f32, width: f32, height: f32) -> CharInfo {
        CharInfo {
            text,
            x,
            y,
            width,
            height,
        }
    }

    #[test]
    fn test_char_info_creation() {
        let info = glyph('A', 100.0, 50.0, 10.0, 12.0);

        assert_eq!(info.text, 'A');
        assert_eq!(info.x, 100.0);
        assert_eq!(info.width, 10.0);
    }

    #[test]
    fn test_should_start_new_word_empty() {
        // With no current word there is nothing to split from.
        let no_chars: Vec<CharInfo> = Vec::new();
        let candidate = glyph('A', 100.0, 50.0, 10.0, 12.0);

        assert!(!should_start_new_word(&no_chars, &candidate));
    }

    #[test]
    fn test_should_start_new_word_spacing() {
        let current = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];

        // 1 unit gap after the previous right edge (110) - same word.
        let near = glyph('B', 111.0, 50.0, 10.0, 12.0);
        assert!(!should_start_new_word(&current, &near));

        // 10 unit gap (> WORD_SPACING_THRESHOLD) - new word.
        let distant = glyph('C', 120.0, 50.0, 10.0, 12.0);
        assert!(should_start_new_word(&current, &distant));
    }

    #[test]
    fn test_should_start_new_word_different_line() {
        let current = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];

        // Same x, but y differs by more than half the character height.
        let other_line = glyph('B', 100.0, 70.0, 10.0, 12.0);
        assert!(should_start_new_word(&current, &other_line));
    }

    #[test]
    fn test_finalize_word_basic() {
        let letters = vec![glyph('H', 100.0, 50.0, 10.0, 12.0), glyph('i', 110.0, 50.0, 8.0, 12.0)];

        let page_height = 800;
        let word = finalize_word(&letters, page_height, 0.0).unwrap();

        assert_eq!(word.text, "Hi");
        assert_eq!(word.left, 100);
        assert_eq!(word.width, 18); // 110 + 8 - 100
        assert_eq!(word.height, 12);
        assert_eq!(word.confidence, 95.0);
    }

    #[test]
    fn test_finalize_word_empty() {
        let no_chars: Vec<CharInfo> = Vec::new();
        assert!(finalize_word(&no_chars, 800, 0.0).is_none());
    }

    #[test]
    fn test_finalize_word_confidence_filter() {
        let letters = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];

        // Threshold below the fixed 95.0 confidence - word is kept.
        assert!(finalize_word(&letters, 800, 90.0).is_some());

        // Threshold above the fixed 95.0 confidence - word is dropped.
        assert!(finalize_word(&letters, 800, 96.0).is_none());
    }

    #[test]
    fn test_coordinate_conversion() {
        // PDF coordinates (bottom-left origin) become image coordinates (top-left origin).
        let letters = vec![glyph('A', 100.0, 700.0, 10.0, 12.0)];

        let page_height = 800;
        let word = finalize_word(&letters, page_height, 0.0).unwrap();

        // top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
        assert_eq!(word.top, 88);
    }

    #[test]
    fn test_word_bounding_box() {
        // The second character sits slightly higher and is slightly taller.
        let letters = vec![glyph('A', 100.0, 50.0, 10.0, 12.0), glyph('B', 110.0, 51.0, 10.0, 13.0)];

        let word = finalize_word(&letters, 800, 0.0).unwrap();

        // Left is the minimum x.
        assert_eq!(word.left, 100);

        // Width spans leftmost to rightmost edge: 120 - 100.
        assert_eq!(word.width, 20);

        // max(y + height) - min(y) = max(51 + 13, 50 + 12) - 50 = 64 - 50 = 14
        assert_eq!(word.height, 14);
    }
}
@@ -0,0 +1,161 @@
1
+ use super::error::{PdfError, Result};
2
+ use pdfium_render::prelude::*;
3
+
4
+ pub struct PdfTextExtractor {
5
+ pdfium: Pdfium,
6
+ }
7
+
8
+ impl PdfTextExtractor {
9
+ pub fn new() -> Result<Self> {
10
+ let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
11
+ .or_else(|_| Pdfium::bind_to_system_library())
12
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
13
+
14
+ let pdfium = Pdfium::new(binding);
15
+ Ok(Self { pdfium })
16
+ }
17
+
18
+ pub fn extract_text(&self, pdf_bytes: &[u8]) -> Result<String> {
19
+ self.extract_text_with_password(pdf_bytes, None)
20
+ }
21
+
22
+ pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
23
+ let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
24
+ let err_msg = e.to_string();
25
+ if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
26
+ PdfError::InvalidPassword
27
+ } else if err_msg.contains("password") || err_msg.contains("Password") {
28
+ PdfError::PasswordRequired
29
+ } else {
30
+ PdfError::InvalidPdf(err_msg)
31
+ }
32
+ })?;
33
+
34
+ extract_text_from_pdf_document(&document)
35
+ }
36
+
37
+ pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
38
+ let mut last_error = None;
39
+
40
+ for password in passwords {
41
+ match self.extract_text_with_password(pdf_bytes, Some(password)) {
42
+ Ok(text) => return Ok(text),
43
+ Err(e) => {
44
+ last_error = Some(e);
45
+ continue;
46
+ }
47
+ }
48
+ }
49
+
50
+ if let Some(err) = last_error {
51
+ return Err(err);
52
+ }
53
+
54
+ self.extract_text(pdf_bytes)
55
+ }
56
+
57
+ pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
58
+ let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
59
+ let err_msg = e.to_string();
60
+ if err_msg.contains("password") || err_msg.contains("Password") {
61
+ PdfError::PasswordRequired
62
+ } else {
63
+ PdfError::InvalidPdf(err_msg)
64
+ }
65
+ })?;
66
+
67
+ Ok(document.pages().len() as usize)
68
+ }
69
+ }
70
+
71
+ impl Default for PdfTextExtractor {
72
+ fn default() -> Self {
73
+ Self::new().expect("Failed to create PDF text extractor")
74
+ }
75
+ }
76
+
77
+ pub fn extract_text_from_pdf(pdf_bytes: &[u8]) -> Result<String> {
78
+ let extractor = PdfTextExtractor::new()?;
79
+ extractor.extract_text(pdf_bytes)
80
+ }
81
+
82
+ pub fn extract_text_from_pdf_with_password(pdf_bytes: &[u8], password: &str) -> Result<String> {
83
+ let extractor = PdfTextExtractor::new()?;
84
+ extractor.extract_text_with_password(pdf_bytes, Some(password))
85
+ }
86
+
87
+ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
88
+ let extractor = PdfTextExtractor::new()?;
89
+ extractor.extract_text_with_passwords(pdf_bytes, passwords)
90
+ }
91
+
92
+ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
93
+ let page_count = document.pages().len() as usize;
94
+
95
+ // Pre-allocate capacity based on estimated page size (average 2KB per page)
96
+ // This reduces memory reallocations during string concatenation
97
+ let estimated_size = page_count * 2048;
98
+ let mut content = String::with_capacity(estimated_size);
99
+
100
+ for page in document.pages().iter() {
101
+ let text = page
102
+ .text()
103
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
104
+
105
+ let page_text = text.all();
106
+
107
+ if !content.is_empty() {
108
+ content.push_str("\n\n");
109
+ }
110
+ content.push_str(&page_text);
111
+ }
112
+
113
+ // Shrink to actual size to free unused capacity
114
+ content.shrink_to_fit();
115
+
116
+ Ok(content)
117
+ }
118
+
119
#[cfg(test)]
mod tests {
    use super::*;

    /// Extractor shared by the tests below; panics if Pdfium cannot be bound.
    fn extractor() -> PdfTextExtractor {
        PdfTextExtractor::new().unwrap()
    }

    #[test]
    fn test_extractor_creation() {
        assert!(PdfTextExtractor::new().is_ok());
    }

    #[test]
    fn test_extract_empty_pdf() {
        let outcome = extractor().extract_text(b"");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_extract_invalid_pdf() {
        let outcome = extractor().extract_text(b"not a pdf");
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), PdfError::InvalidPdf(_)));
    }

    #[test]
    fn test_password_required_detection() {
        // Only a PDF header and a binary marker comment - not a complete document.
        let header_only_pdf = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";
        let outcome = extractor().extract_text(header_only_pdf);

        // A successful load is tolerated; a failure must be one of these two kinds.
        if let Err(err) = outcome {
            assert!(matches!(err, PdfError::PasswordRequired | PdfError::InvalidPdf(_)));
        }
    }

    #[test]
    fn test_extract_text_with_passwords_empty_list() {
        // With no passwords the call falls back to a plain open, which fails here.
        let outcome = extractor().extract_text_with_passwords(b"not a pdf", &[]);
        assert!(outcome.is_err());
    }
}