kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,393 @@
1
+ //! PDF table extraction using pdfium character positions.
2
+ //!
3
+ //! This module converts pdfium character data to HocrWord format,
4
+ //! allowing us to reuse the existing table reconstruction logic.
5
+
6
+ use super::error::{PdfError, Result};
7
+ use html_to_markdown_rs::hocr::HocrWord;
8
+ use pdfium_render::prelude::*;
9
+
10
/// Largest horizontal gap (in PDF units) that still keeps two neighbouring
/// characters inside the same word.
///
/// A gap wider than this value marks a word boundary.
const WORD_SPACING_THRESHOLD: f32 = 3.0;

/// Smallest word length kept for table detection; anything shorter is
/// discarded as noise.
const MIN_WORD_LENGTH: usize = 1;
17
+
18
+ /// Extract words with positions from PDF page for table detection.
19
+ ///
20
+ /// Groups adjacent characters into words based on spacing heuristics,
21
+ /// then converts to HocrWord format for table reconstruction.
22
+ ///
23
+ /// # Arguments
24
+ ///
25
+ /// * `page` - PDF page to extract words from
26
+ /// * `min_confidence` - Minimum confidence threshold (0.0-100.0). PDF text has high confidence (95.0).
27
+ ///
28
+ /// # Returns
29
+ ///
30
+ /// Vector of HocrWord objects with text and bounding box information.
31
+ ///
32
+ /// # Example
33
+ ///
34
+ /// ```rust,no_run
35
+ /// use kreuzberg::pdf::table::extract_words_from_page;
36
+ /// use pdfium_render::prelude::*;
37
+ ///
38
+ /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
39
+ /// let pdfium = Pdfium::default();
40
+ /// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
41
+ /// let page = document.pages().get(0)?;
42
+ /// let words = extract_words_from_page(&page, 90.0)?;
43
+ /// # Ok(())
44
+ /// # }
45
+ /// ```
46
+ pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
47
+ let page_width = page.width().value as i32;
48
+ let page_height = page.height().value as i32;
49
+
50
+ let page_text = page
51
+ .text()
52
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
53
+
54
+ let chars = page_text.chars();
55
+
56
+ let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
57
+
58
+ Ok(words)
59
+ }
60
+
61
/// One PDF character together with the geometry used for word grouping.
#[derive(Debug, Clone)]
struct CharInfo {
    /// Unicode character of the glyph.
    text: char,
    /// Left edge of the character bounds.
    x: f32,
    /// Bottom edge of the character bounds.
    y: f32,
    /// Width of the character bounds.
    width: f32,
    /// Height of the character bounds.
    height: f32,
}
70
+
71
+ /// Group PDF characters into words based on spacing heuristics.
72
+ ///
73
+ /// Characters are grouped into the same word if they are:
74
+ /// 1. On the same horizontal line (similar y-coordinate)
75
+ /// 2. Close together horizontally (spacing < WORD_SPACING_THRESHOLD)
76
+ ///
77
+ /// # Arguments
78
+ ///
79
+ /// * `chars` - Iterator of PDF page characters
80
+ /// * `page_width` - Page width in PDF units
81
+ /// * `page_height` - Page height in PDF units
82
+ /// * `min_confidence` - Minimum confidence threshold (PDF text uses 95.0)
83
+ fn group_chars_into_words(
84
+ chars: PdfPageTextChars,
85
+ _page_width: i32,
86
+ page_height: i32,
87
+ min_confidence: f64,
88
+ ) -> Result<Vec<HocrWord>> {
89
+ let mut words: Vec<HocrWord> = Vec::new();
90
+ let mut current_word_chars: Vec<CharInfo> = Vec::new();
91
+
92
+ for pdf_char in chars.iter() {
93
+ let bounds = pdf_char
94
+ .loose_bounds()
95
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
96
+
97
+ let Some(ch) = pdf_char.unicode_char() else {
98
+ continue;
99
+ };
100
+
101
+ let char_info = CharInfo {
102
+ text: ch,
103
+ x: bounds.left().value,
104
+ y: bounds.bottom().value,
105
+ width: bounds.width().value,
106
+ height: bounds.height().value,
107
+ };
108
+
109
+ if char_info.text.is_whitespace() {
110
+ if !current_word_chars.is_empty() {
111
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
112
+ words.push(word);
113
+ }
114
+ current_word_chars.clear();
115
+ }
116
+ continue;
117
+ }
118
+
119
+ if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
120
+ if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
121
+ words.push(word);
122
+ }
123
+ current_word_chars.clear();
124
+ }
125
+
126
+ current_word_chars.push(char_info);
127
+ }
128
+
129
+ if !current_word_chars.is_empty()
130
+ && let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence)
131
+ {
132
+ words.push(word);
133
+ }
134
+
135
+ Ok(words)
136
+ }
137
+
138
+ /// Determine if a new character should start a new word.
139
+ ///
140
+ /// Returns true if the character is far from the previous character
141
+ /// (indicating a word boundary) or on a different line.
142
+ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -> bool {
143
+ if current_word_chars.is_empty() {
144
+ return false;
145
+ }
146
+
147
+ let last_char = &current_word_chars[current_word_chars.len() - 1];
148
+
149
+ let vertical_distance = (new_char.y - last_char.y).abs();
150
+ if vertical_distance > last_char.height * 0.5 {
151
+ return true;
152
+ }
153
+
154
+ let horizontal_gap = new_char.x - (last_char.x + last_char.width);
155
+ horizontal_gap > WORD_SPACING_THRESHOLD
156
+ }
157
+
158
+ /// Convert a group of characters into a HocrWord.
159
+ ///
160
+ /// Calculates bounding box and confidence for the word.
161
+ /// Returns None if the word doesn't meet minimum criteria.
162
+ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> Option<HocrWord> {
163
+ if chars.is_empty() {
164
+ return None;
165
+ }
166
+
167
+ let text: String = chars.iter().map(|c| c.text).collect();
168
+
169
+ if text.len() < MIN_WORD_LENGTH {
170
+ return None;
171
+ }
172
+
173
+ let left = chars
174
+ .iter()
175
+ .map(|c| c.x)
176
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
177
+ .unwrap_or(0.0);
178
+ let right = chars
179
+ .iter()
180
+ .map(|c| c.x + c.width)
181
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
182
+ .unwrap_or(0.0);
183
+ let bottom = chars
184
+ .iter()
185
+ .map(|c| c.y)
186
+ .min_by(|a, b| a.partial_cmp(b).unwrap())
187
+ .unwrap_or(0.0);
188
+ let top = chars
189
+ .iter()
190
+ .map(|c| c.y + c.height)
191
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
192
+ .unwrap_or(0.0);
193
+
194
+ let width = (right - left).round() as i32;
195
+ let height = (top - bottom).round() as i32;
196
+
197
+ let top_in_image_coords = (page_height as f32 - top).round() as i32;
198
+
199
+ let confidence = 95.0;
200
+
201
+ if confidence < min_confidence {
202
+ return None;
203
+ }
204
+
205
+ Some(HocrWord {
206
+ text,
207
+ left: left.round().max(0.0) as u32,
208
+ top: top_in_image_coords.max(0) as u32,
209
+ width: width.max(0) as u32,
210
+ height: height.max(0) as u32,
211
+ confidence,
212
+ })
213
+ }
214
+
215
#[cfg(test)]
mod tests {
    use super::*;

    /// Page height used by every test that builds a word.
    const PAGE_HEIGHT: i32 = 800;

    /// Shorthand constructor for a positioned test character.
    fn glyph(text: char, x: f32, y: f32, width: f32, height: f32) -> CharInfo {
        CharInfo {
            text,
            x,
            y,
            width,
            height,
        }
    }

    /// The struct stores the values it was built with.
    #[test]
    fn test_char_info_creation() {
        let info = glyph('A', 100.0, 50.0, 10.0, 12.0);

        assert_eq!(info.text, 'A');
        assert_eq!(info.x, 100.0);
        assert_eq!(info.width, 10.0);
    }

    /// With no pending characters a new character never forces a split.
    #[test]
    fn test_should_start_new_word_empty() {
        let pending: Vec<CharInfo> = Vec::new();
        let candidate = glyph('A', 100.0, 50.0, 10.0, 12.0);

        assert!(!should_start_new_word(&pending, &candidate));
    }

    /// A small gap keeps the word together, a large gap splits it.
    #[test]
    fn test_should_start_new_word_spacing() {
        let pending = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];

        let adjacent = glyph('B', 111.0, 50.0, 10.0, 12.0);
        assert!(!should_start_new_word(&pending, &adjacent));

        let distant = glyph('C', 120.0, 50.0, 10.0, 12.0);
        assert!(should_start_new_word(&pending, &distant));
    }

    /// A character on another line always starts a new word.
    #[test]
    fn test_should_start_new_word_different_line() {
        let pending = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];
        let next_line = glyph('B', 100.0, 70.0, 10.0, 12.0);

        assert!(should_start_new_word(&pending, &next_line));
    }

    /// Text, position, size and confidence of a simple two-letter word.
    #[test]
    fn test_finalize_word_basic() {
        let letters = vec![glyph('H', 100.0, 50.0, 10.0, 12.0), glyph('i', 110.0, 50.0, 8.0, 12.0)];

        let word = finalize_word(&letters, PAGE_HEIGHT, 0.0).unwrap();

        assert_eq!(word.text, "Hi");
        assert_eq!(word.left, 100);
        assert_eq!(word.width, 18);
        assert_eq!(word.height, 12);
        assert_eq!(word.confidence, 95.0);
    }

    /// No characters means no word.
    #[test]
    fn test_finalize_word_empty() {
        let letters: Vec<CharInfo> = Vec::new();

        assert!(finalize_word(&letters, PAGE_HEIGHT, 0.0).is_none());
    }

    /// Words survive a threshold below 95.0 and are dropped above it.
    #[test]
    fn test_finalize_word_confidence_filter() {
        let letters = vec![glyph('A', 100.0, 50.0, 10.0, 12.0)];

        assert!(finalize_word(&letters, PAGE_HEIGHT, 90.0).is_some());
        assert!(finalize_word(&letters, PAGE_HEIGHT, 96.0).is_none());
    }

    /// The top edge is reported as a distance from the top of the page.
    #[test]
    fn test_coordinate_conversion() {
        let letters = vec![glyph('A', 100.0, 700.0, 10.0, 12.0)];

        let word = finalize_word(&letters, PAGE_HEIGHT, 0.0).unwrap();

        assert_eq!(word.top, 88);
    }

    /// The bounding box is the union of the individual character boxes.
    #[test]
    fn test_word_bounding_box() {
        let letters = vec![glyph('A', 100.0, 50.0, 10.0, 12.0), glyph('B', 110.0, 51.0, 10.0, 13.0)];

        let word = finalize_word(&letters, PAGE_HEIGHT, 0.0).unwrap();

        assert_eq!(word.left, 100);
        assert_eq!(word.width, 20);
        assert_eq!(word.height, 14);
    }
}
@@ -0,0 +1,158 @@
1
+ use super::error::{PdfError, Result};
2
+ use pdfium_render::prelude::*;
3
+
4
+ pub struct PdfTextExtractor {
5
+ pdfium: Pdfium,
6
+ }
7
+
8
+ impl PdfTextExtractor {
9
+ pub fn new() -> Result<Self> {
10
+ let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
11
+ .or_else(|_| Pdfium::bind_to_system_library())
12
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
13
+
14
+ let pdfium = Pdfium::new(binding);
15
+ Ok(Self { pdfium })
16
+ }
17
+
18
+ pub fn extract_text(&self, pdf_bytes: &[u8]) -> Result<String> {
19
+ self.extract_text_with_password(pdf_bytes, None)
20
+ }
21
+
22
+ pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
23
+ let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
24
+ let err_msg = e.to_string();
25
+ if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
26
+ PdfError::InvalidPassword
27
+ } else if err_msg.contains("password") || err_msg.contains("Password") {
28
+ PdfError::PasswordRequired
29
+ } else {
30
+ PdfError::InvalidPdf(err_msg)
31
+ }
32
+ })?;
33
+
34
+ extract_text_from_pdf_document(&document)
35
+ }
36
+
37
+ pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
38
+ let mut last_error = None;
39
+
40
+ for password in passwords {
41
+ match self.extract_text_with_password(pdf_bytes, Some(password)) {
42
+ Ok(text) => return Ok(text),
43
+ Err(e) => {
44
+ last_error = Some(e);
45
+ continue;
46
+ }
47
+ }
48
+ }
49
+
50
+ if let Some(err) = last_error {
51
+ return Err(err);
52
+ }
53
+
54
+ self.extract_text(pdf_bytes)
55
+ }
56
+
57
+ pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
58
+ let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
59
+ let err_msg = e.to_string();
60
+ if err_msg.contains("password") || err_msg.contains("Password") {
61
+ PdfError::PasswordRequired
62
+ } else {
63
+ PdfError::InvalidPdf(err_msg)
64
+ }
65
+ })?;
66
+
67
+ Ok(document.pages().len() as usize)
68
+ }
69
+ }
70
+
71
+ impl Default for PdfTextExtractor {
72
+ fn default() -> Self {
73
+ Self::new().expect("Failed to create PDF text extractor")
74
+ }
75
+ }
76
+
77
+ pub fn extract_text_from_pdf(pdf_bytes: &[u8]) -> Result<String> {
78
+ let extractor = PdfTextExtractor::new()?;
79
+ extractor.extract_text(pdf_bytes)
80
+ }
81
+
82
+ pub fn extract_text_from_pdf_with_password(pdf_bytes: &[u8], password: &str) -> Result<String> {
83
+ let extractor = PdfTextExtractor::new()?;
84
+ extractor.extract_text_with_password(pdf_bytes, Some(password))
85
+ }
86
+
87
+ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
88
+ let extractor = PdfTextExtractor::new()?;
89
+ extractor.extract_text_with_passwords(pdf_bytes, passwords)
90
+ }
91
+
92
+ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
93
+ let page_count = document.pages().len() as usize;
94
+
95
+ let estimated_size = page_count * 2048;
96
+ let mut content = String::with_capacity(estimated_size);
97
+
98
+ for page in document.pages().iter() {
99
+ let text = page
100
+ .text()
101
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
102
+
103
+ let page_text = text.all();
104
+
105
+ if !content.is_empty() {
106
+ content.push_str("\n\n");
107
+ }
108
+ content.push_str(&page_text);
109
+ }
110
+
111
+ content.shrink_to_fit();
112
+
113
+ Ok(content)
114
+ }
115
+
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Binding to a Pdfium library succeeds in the test environment.
    #[test]
    fn test_extractor_creation() {
        assert!(PdfTextExtractor::new().is_ok());
    }

    /// Zero bytes are not a loadable document.
    #[test]
    fn test_extract_empty_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();

        assert!(extractor.extract_text(b"").is_err());
    }

    /// Arbitrary bytes are reported as an invalid PDF.
    #[test]
    fn test_extract_invalid_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();
        let outcome = extractor.extract_text(b"not a pdf");

        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), PdfError::InvalidPdf(_)));
    }

    /// A truncated PDF header either loads or fails with one of the two
    /// load-time error kinds.
    #[test]
    fn test_password_required_detection() {
        let extractor = PdfTextExtractor::new().unwrap();
        let encrypted_pdf = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";

        let Err(failure) = extractor.extract_text(encrypted_pdf) else {
            return;
        };

        assert!(matches!(failure, PdfError::PasswordRequired | PdfError::InvalidPdf(_)));
    }

    /// With no candidate passwords, invalid input still yields an error.
    #[test]
    fn test_extract_text_with_passwords_empty_list() {
        let extractor = PdfTextExtractor::new().unwrap();
        let outcome = extractor.extract_text_with_passwords(b"not a pdf", &[]);

        assert!(outcome.is_err());
    }
}