kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,307 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'OCR Backend Plugin System' do
6
+ let(:test_image) { test_document_path('images/invoice_image.png') }
7
+
8
+ describe 'registering custom OCR backend' do
9
+ it 'registers and uses custom OCR backend class' do # end-to-end: register a backend object, then extract through it
10
+ class MockOcrBackend # test double that records whether, and with what config, it was invoked
11
+ include Kreuzberg::OcrBackendProtocol
12
+
13
+ attr_reader :process_called, :received_config
14
+
15
+ def initialize
16
+ @process_called = false
17
+ @received_config = nil
18
+ end
19
+
20
+ def name
21
+ 'mock-ocr'
22
+ end
23
+
24
+ def process_image(_image_bytes, config) # image bytes are ignored; only the call and its config are recorded
25
+ @process_called = true
26
+ @received_config = config
27
+ 'Mocked OCR text from custom backend' # canned text the example later looks for in result.content
28
+ end
29
+ end
30
+
31
+ backend = MockOcrBackend.new
32
+ Kreuzberg.register_ocr_backend('mock-ocr', backend) # NOTE(review): this example never unregisters 'mock-ocr'
33
+
34
+ config = Kreuzberg::Config::Extraction.new(
35
+ force_ocr: true, # presumably routes extraction through OCR so the backend is hit -- confirm against Config docs
36
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'mock-ocr') # same name the backend was registered under
37
+ )
38
+
39
+ result = Kreuzberg.extract_file_sync(test_image, config: config)
40
+
41
+ expect(backend.process_called).to be true
42
+ expect(result.content).to include('Mocked OCR text')
43
+ end
44
+
45
+ it 'passes correct configuration to OCR backend' do # inspects the config argument handed to process_image
46
+ class ConfigCapturingBackend # test double that keeps the config it receives
47
+ include Kreuzberg::OcrBackendProtocol
48
+
49
+ attr_reader :received_config
50
+
51
+ def name
52
+ 'config-capture'
53
+ end
54
+
55
+ def process_image(_image_bytes, config)
56
+ @received_config = config # kept so the example can inspect it after extraction
57
+ 'OCR result'
58
+ end
59
+ end
60
+
61
+ backend = ConfigCapturingBackend.new
62
+ Kreuzberg.register_ocr_backend('config-capture', backend) # NOTE(review): this example never unregisters 'config-capture'
63
+
64
+ config = Kreuzberg::Config::Extraction.new(
65
+ force_ocr: true,
66
+ ocr: Kreuzberg::Config::Ocr.new(
67
+ backend: 'config-capture',
68
+ language: 'eng'
69
+ )
70
+ )
71
+
72
+ Kreuzberg.extract_file_sync(test_image, config: config) # return value unused; only the captured config matters
73
+
74
+ expect(backend.received_config).to be_a(Hash) # the backend is expected to see a Hash, not the Config::Ocr object
75
+ expect(backend.received_config['backend']).to eq('config-capture') # keys are read as strings here
76
+ expect(backend.received_config['language']).to eq('eng')
77
+ end
78
+ end
79
+
80
+ describe 'OCR backend receives correct parameters' do
81
+ it 'receives image bytes as binary data' do
82
+ class BytesCapturingBackend
83
+ include Kreuzberg::OcrBackendProtocol
84
+
85
+ attr_accessor :received_bytes
86
+
87
+ def name
88
+ 'bytes-capture'
89
+ end
90
+
91
+ def process_image(image_bytes, _config)
92
+ self.class.instance_variable_set(:@received_bytes, image_bytes)
93
+ 'OCR result'
94
+ end
95
+ end
96
+
97
+ backend = BytesCapturingBackend.new
98
+ Kreuzberg.register_ocr_backend('bytes-capture', backend)
99
+
100
+ config = Kreuzberg::Config::Extraction.new(
101
+ force_ocr: true,
102
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'bytes-capture')
103
+ )
104
+
105
+ Kreuzberg.extract_file_sync(test_image, config: config)
106
+
107
+ received_bytes = BytesCapturingBackend.instance_variable_get(:@received_bytes)
108
+ expect(received_bytes).to be_a(String)
109
+ expect(received_bytes.encoding).to eq(Encoding::BINARY)
110
+ expect(received_bytes.length).to be_positive
111
+ end
112
+
113
+ it 'backend can return extracted text' do # the backend's return string should surface in result.content
114
+ class SimpleOcrBackend # stateless test double that always returns the same text
115
+ include Kreuzberg::OcrBackendProtocol
116
+
117
+ def name
118
+ 'simple-ocr'
119
+ end
120
+
121
+ def process_image(_image_bytes, _config) # both arguments are ignored
122
+ 'Invoice Total: $1,234.56'
123
+ end
124
+ end
125
+
126
+ backend = SimpleOcrBackend.new
127
+ Kreuzberg.register_ocr_backend('simple-ocr', backend) # NOTE(review): this example never unregisters 'simple-ocr'
128
+
129
+ config = Kreuzberg::Config::Extraction.new(
130
+ force_ocr: true,
131
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'simple-ocr')
132
+ )
133
+
134
+ result = Kreuzberg.extract_file_sync(test_image, config: config)
135
+
136
+ expect(result.content).to include('Invoice Total') # substring checks, not an exact match on the whole content
137
+ expect(result.content).to include('1,234.56')
138
+ end
139
+ end
140
+
141
+ describe 'OCR backend with stateful processing' do
142
+ it 'maintains state across multiple invocations' do # the same backend instance is used for two extractions
143
+ class StatefulOcrBackend # test double that counts how many times process_image runs
144
+ include Kreuzberg::OcrBackendProtocol
145
+
146
+ attr_reader :call_count
147
+
148
+ def initialize
149
+ @call_count = 0
150
+ end
151
+
152
+ def name
153
+ 'stateful-ocr'
154
+ end
155
+
156
+ def process_image(_image_bytes, _config)
157
+ @call_count += 1
158
+ "OCR call number #{@call_count}"
159
+ end
160
+ end
161
+
162
+ backend = StatefulOcrBackend.new
163
+ Kreuzberg.register_ocr_backend('stateful-ocr', backend) # NOTE(review): this example never unregisters 'stateful-ocr'
164
+
165
+ config = Kreuzberg::Config::Extraction.new(
166
+ force_ocr: true,
167
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'stateful-ocr')
168
+ )
169
+
170
+ Kreuzberg.extract_file_sync(test_image, config: config)
171
+ Kreuzberg.extract_file_sync(test_image, config: config) # second run on the same file and config
172
+
173
+ expect(backend.call_count).to be >= 1 # NOTE(review): only one call is required after two runs -- presumably a cache may serve the second; confirm
174
+ end
175
+ end
176
+
177
+ describe 'error handling' do
178
+ it 'propagates errors from OCR backend' do # an exception raised inside process_image must reach the caller
179
+ class FailingOcrBackend # test double whose process_image always raises
180
+ include Kreuzberg::OcrBackendProtocol
181
+
182
+ def name
183
+ 'failing-ocr'
184
+ end
185
+
186
+ def process_image(_image_bytes, _config)
187
+ raise StandardError, 'OCR processing failed'
188
+ end
189
+ end
190
+
191
+ backend = FailingOcrBackend.new
192
+ Kreuzberg.register_ocr_backend('failing-ocr', backend) # NOTE(review): this example never unregisters 'failing-ocr'
193
+
194
+ config = Kreuzberg::Config::Extraction.new(
195
+ force_ocr: true,
196
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'failing-ocr')
197
+ )
198
+
199
+ expect do
200
+ Kreuzberg.extract_file_sync(test_image, config: config)
201
+ end.to raise_error(StandardError, /OCR processing failed/) # matches on the message, so wrapped errors carrying it also pass
202
+ end
203
+
204
+ it 'handles missing OCR backend gracefully' do
205
+ config = Kreuzberg::Config::Extraction.new(
206
+ force_ocr: true,
207
+ ocr: Kreuzberg::Config::Ocr.new(backend: 'nonexistent-backend')
208
+ )
209
+
210
+ expect do
211
+ Kreuzberg.extract_file_sync(test_image, config: config)
212
+ end.to raise_error
213
+ end
214
+ end
215
+
216
+ describe 'OCR backend protocol implementation' do
217
+ it 'requires name method' do
218
+ class InvalidBackendNoName
219
+ def process_image(_image_bytes, _config)
220
+ 'text'
221
+ end
222
+ end
223
+
224
+ backend = InvalidBackendNoName.new
225
+
226
+ expect do
227
+ Kreuzberg.register_ocr_backend('invalid', backend)
228
+ end.to raise_error
229
+ end
230
+
231
+ it 'requires process_image method' do
232
+ class InvalidBackendNoProcess
233
+ def name
234
+ 'invalid'
235
+ end
236
+ end
237
+
238
+ backend = InvalidBackendNoProcess.new
239
+
240
+ expect do
241
+ Kreuzberg.register_ocr_backend('invalid', backend)
242
+ end.to raise_error
243
+ end
244
+ end
245
+
246
+ describe 'OCR backend management' do
247
+ describe '.list_ocr_backends' do
248
+ it 'returns an array of backend names' do # only the return type is checked, not the contents
249
+ backends = Kreuzberg.list_ocr_backends
250
+ expect(backends).to be_an(Array)
251
+ end
252
+
253
+ it 'includes registered backends' do # a freshly registered name must show up in the listing
254
+ class ListTestBackend # minimal backend: fixed name, fixed OCR text
255
+ include Kreuzberg::OcrBackendProtocol
256
+
257
+ def name
258
+ 'list-test-backend'
259
+ end
260
+
261
+ def process_image(_image_bytes, _config)
262
+ 'test'
263
+ end
264
+ end
265
+
266
+ backend = ListTestBackend.new
267
+ Kreuzberg.register_ocr_backend('list-test-backend', backend)
268
+
269
+ backends = Kreuzberg.list_ocr_backends
270
+ expect(backends).to include('list-test-backend')
271
+
272
+ Kreuzberg.unregister_ocr_backend('list-test-backend') # cleanup; skipped if the expectation above fails
273
+ end
274
+ end
275
+
276
+ describe '.unregister_ocr_backend' do
277
+ it 'removes backend from registry' do # register, confirm listed, unregister, confirm gone
278
+ class UnregisterTestBackend # minimal backend: fixed name, fixed OCR text
279
+ include Kreuzberg::OcrBackendProtocol
280
+
281
+ def name
282
+ 'unregister-test'
283
+ end
284
+
285
+ def process_image(_image_bytes, _config)
286
+ 'test'
287
+ end
288
+ end
289
+
290
+ backend = UnregisterTestBackend.new
291
+ Kreuzberg.register_ocr_backend('unregister-test', backend)
292
+
293
+ backends = Kreuzberg.list_ocr_backends
294
+ expect(backends).to include('unregister-test') # precondition: the name is listed before removal
295
+
296
+ Kreuzberg.unregister_ocr_backend('unregister-test')
297
+
298
+ backends = Kreuzberg.list_ocr_backends # re-read the listing after removal
299
+ expect(backends).not_to include('unregister-test')
300
+ end
301
+
302
+ it 'accepts nonexistent backend name without error' do # unregistering an unknown name is expected not to raise
303
+ expect { Kreuzberg.unregister_ocr_backend('nonexistent-backend-xyz') }.not_to raise_error
304
+ end
305
+ end
306
+ end
307
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'PostProcessor Plugin System' do
6
# Document fed to every extraction in this spec.
# NOTE(review): despite the `test_pdf` name, the path points at a .txt fixture.
let(:test_pdf) { test_document_path('text/contract_test.txt') }

# Drop every registered post-processor after each example.
after { Kreuzberg.clear_post_processors }
11
+
12
# Post-processors supplied as plain lambdas that receive the result hash and
# return it after mutation.
describe 'registering post-processor as Proc' do
  it 'registers and executes Proc post-processor during extraction' do
    invoked = false
    upcaser = lambda do |payload|
      invoked = true
      payload.tap { |hash| hash['content'] = hash['content'].upcase }
    end

    Kreuzberg.register_post_processor('upcase', upcaser)
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(invoked).to be true
    # Text that is already upper-cased is unchanged by another #upcase.
    expect(extraction.content).to eq(extraction.content.upcase)
  end

  it 'allows post-processor to modify result content' do
    tagger = lambda do |payload|
      payload.tap { |hash| hash['content'] = "[PROCESSED] #{hash['content']}" }
    end

    Kreuzberg.register_post_processor('prefix', tagger)
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.content).to start_with('[PROCESSED]')
  end

  it 'allows post-processor to add metadata' do
    annotator = lambda do |payload|
      payload.tap do |hash|
        hash['metadata']['custom_field'] = 'custom_value'
        hash['metadata']['word_count'] = hash['content'].split.size
      end
    end

    Kreuzberg.register_post_processor('metadata_adder', annotator)
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.metadata['custom_field']).to eq('custom_value')
    expect(extraction.metadata['word_count']).to be_positive
  end
end
54
+
55
# Post-processors supplied as objects that include
# Kreuzberg::PostProcessorProtocol and implement #call(result).
describe 'registering post-processor as class' do
  it 'registers and executes class-based post-processor' do
    # Anonymous class instead of `class WordCountProcessor`: a `class`
    # statement inside an example defines a constant that outlives it.
    word_count_processor = Class.new do
      include Kreuzberg::PostProcessorProtocol

      def call(result)
        word_count = result['content'].split.length
        result['metadata']['word_count'] = word_count
        result['metadata']['processor_name'] = 'WordCountProcessor'
        result
      end
    end

    Kreuzberg.register_post_processor('word_count', word_count_processor.new)
    result = Kreuzberg.extract_file_sync(test_pdf)

    expect(result.metadata['word_count']).to be_positive
    expect(result.metadata['processor_name']).to eq('WordCountProcessor')
  end

  it 'allows class-based processor to transform content' do
    # Anonymous class for the same reason as above (no leaked constant).
    truncate_processor = Class.new do
      include Kreuzberg::PostProcessorProtocol

      def initialize(max_length)
        @max_length = max_length
      end

      # Cuts content to @max_length characters and appends "..." when it is
      # longer than the limit; shorter content is left untouched.
      def call(result)
        result['content'] = "#{result['content'][0...@max_length]}..." if result['content'].length > @max_length
        result
      end
    end

    Kreuzberg.register_post_processor('truncate', truncate_processor.new(50))
    result = Kreuzberg.extract_file_sync(test_pdf)

    # 50 kept characters plus the 3-character "..." suffix.
    expect(result.content.length).to be <= 53
  end
end
97
+
98
# Several post-processors registered at once should all run during a single
# extraction.
describe 'multiple post-processors' do
  # NOTE(review): the example only checks that both processors ran; it does
  # not observe their relative order despite the title.
  it 'executes multiple registered post-processors in order' do
    { 'proc1' => 'processor1', 'proc2' => 'processor2' }.each do |registry_name, marker_key|
      stamper = lambda do |payload|
        payload['metadata'][marker_key] = 'executed'
        payload
      end
      Kreuzberg.register_post_processor(registry_name, stamper)
    end

    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.metadata['processor1']).to eq('executed')
    expect(extraction.metadata['processor2']).to eq('executed')
  end
end
118
+
119
# Removing a single post-processor by name must stop it from running while
# leaving the remaining processors in place.
describe 'unregister_post_processor' do
  it 'removes a registered post-processor by name' do
    marker = lambda do |payload|
      payload.tap { |hash| hash['metadata']['should_not_appear'] = 'value' }
    end

    Kreuzberg.register_post_processor('removable', marker)
    Kreuzberg.unregister_post_processor('removable')
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.metadata['should_not_appear']).to be_nil
  end

  it 'does not affect other registered post-processors' do
    # Each processor is registered under the same name as the metadata key it
    # writes; registration order is keep1, remove, keep3.
    stamps = { 'keep1' => 'value1', 'remove' => 'value2', 'keep3' => 'value3' }
    stamps.each do |key, value|
      stamper = lambda do |payload|
        payload['metadata'][key] = value
        payload
      end
      Kreuzberg.register_post_processor(key, stamper)
    end

    Kreuzberg.unregister_post_processor('remove')
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.metadata['keep1']).to eq('value1')
    expect(extraction.metadata['remove']).to be_nil
    expect(extraction.metadata['keep3']).to eq('value3')
  end
end
161
+
162
# Clearing the registry must prevent every previously registered processor
# from running.
describe 'clear_post_processors' do
  it 'removes all registered post-processors' do
    # Each processor is registered under the same name as the metadata key it
    # would write.
    { 'proc1' => 'value1', 'proc2' => 'value2' }.each do |key, value|
      stamper = lambda do |payload|
        payload['metadata'][key] = value
        payload
      end
      Kreuzberg.register_post_processor(key, stamper)
    end

    Kreuzberg.clear_post_processors
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    expect(extraction.metadata['proc1']).to be_nil
    expect(extraction.metadata['proc2']).to be_nil
  end
end
184
+
185
# Failures inside a post-processor must surface to the caller of the
# extraction.
describe 'error handling' do
  it 'propagates errors from post-processor' do
    exploder = ->(_payload) { raise StandardError, 'Post-processor error' }

    Kreuzberg.register_post_processor('failing', exploder)

    expect { Kreuzberg.extract_file_sync(test_pdf) }
      .to raise_error(StandardError, /Post-processor error/)
  end

  it 'handles post-processor that returns invalid result' do
    # Returns a String instead of the result object it was given.
    bogus = ->(_payload) { 'invalid return value' }

    Kreuzberg.register_post_processor('invalid', bogus)

    # NOTE(review): bare `raise_error` accepts any exception; consider pinning
    # the concrete error class once it is confirmed.
    expect { Kreuzberg.extract_file_sync(test_pdf) }.to raise_error
  end
end
210
+
211
# Listing must reflect the current registry contents. Every example clears the
# registry first so it starts from a known-empty state.
describe 'list_post_processors' do
  it 'returns empty array when no post-processors registered' do
    Kreuzberg.clear_post_processors

    names = Kreuzberg.list_post_processors

    expect(names).to be_an(Array)
    expect(names).to be_empty
  end

  it 'returns post-processor names after registration' do
    Kreuzberg.clear_post_processors
    upcaser = lambda do |payload|
      payload.tap { |hash| hash['content'] = hash['content'].upcase }
    end

    Kreuzberg.register_post_processor('test-processor', upcaser)

    expect(Kreuzberg.list_post_processors).to include('test-processor')
    Kreuzberg.clear_post_processors
  end

  it 'returns all registered post-processor names' do
    Kreuzberg.clear_post_processors

    # Three distinct pass-through processors, registered in this order.
    %w[processor-one processor-two processor-three].each do |processor_name|
      Kreuzberg.register_post_processor(processor_name, ->(payload) { payload })
    end

    expect(Kreuzberg.list_post_processors)
      .to contain_exactly('processor-one', 'processor-two', 'processor-three')
    Kreuzberg.clear_post_processors
  end

  it 'reflects changes after unregistration' do
    Kreuzberg.clear_post_processors
    Kreuzberg.register_post_processor('temp-processor', ->(payload) { payload })

    expect(Kreuzberg.list_post_processors).to include('temp-processor')

    Kreuzberg.unregister_post_processor('temp-processor')

    expect(Kreuzberg.list_post_processors).not_to include('temp-processor')
    Kreuzberg.clear_post_processors
  end
end
269
+ end