kreuzberg 4.0.0.pre.rc.6

This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (330)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,274 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'Validator Plugin System' do
6
+ let(:test_pdf) { test_document_path('text/contract_test.txt') }
7
+
8
+ after do
9
+ Kreuzberg.clear_validators
10
+ end
11
+
12
+ describe 'registering validator as Proc' do
13
+ it 'registers and executes Proc validator during extraction' do
14
+ validator_called = false
15
+ validator = lambda do |_result|
16
+ validator_called = true
17
+ end
18
+
19
+ Kreuzberg.register_validator('check_called', validator)
20
+ Kreuzberg.extract_file_sync(test_pdf)
21
+
22
+ expect(validator_called).to be true
23
+ end
24
+
25
+ it 'allows extraction to proceed when validator passes' do
26
+ validator = lambda do |result|
27
+ # Validation passes - do nothing
28
+ end
29
+
30
+ Kreuzberg.register_validator('pass_validator', validator)
31
+ result = Kreuzberg.extract_file_sync(test_pdf)
32
+
33
+ expect(result).to be_a(Kreuzberg::Result)
34
+ expect(result.content).not_to be_empty
35
+ end
36
+
37
+ it 'prevents extraction when validator raises ValidationError' do
38
+ validator = lambda do |result|
39
+ if result['content'].length < 10_000_000
40
+ raise Kreuzberg::Errors::ValidationError, 'Content too short for this test'
41
+ end
42
+ end
43
+
44
+ Kreuzberg.register_validator('min_length', validator)
45
+
46
+ expect do
47
+ Kreuzberg.extract_file_sync(test_pdf)
48
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Content too short/)
49
+ end
50
+ end
51
+
52
+ describe 'registering validator as class' do
53
+ it 'registers and executes class-based validator' do
54
+ class MinimumLengthValidator
55
+ include Kreuzberg::ValidatorProtocol
56
+
57
+ def initialize(min_length)
58
+ @min_length = min_length
59
+ end
60
+
61
+ def call(result)
62
+ return unless result['content'].length < @min_length
63
+
64
+ raise Kreuzberg::Errors::ValidationError, "Content too short: #{result['content'].length} < #{@min_length}"
65
+ end
66
+ end
67
+
68
+ validator = MinimumLengthValidator.new(10)
69
+ Kreuzberg.register_validator('min_length', validator)
70
+ result = Kreuzberg.extract_file_sync(test_pdf)
71
+
72
+ expect(result).to be_a(Kreuzberg::Result)
73
+ expect(result.content.length).to be >= 10
74
+ end
75
+
76
+ it 'validates based on content characteristics' do
77
+ class NonEmptyValidator
78
+ include Kreuzberg::ValidatorProtocol
79
+
80
+ def call(result)
81
+ return unless result['content'].strip.empty?
82
+
83
+ raise Kreuzberg::Errors::ValidationError, 'Content cannot be empty'
84
+ end
85
+ end
86
+
87
+ validator = NonEmptyValidator.new
88
+ Kreuzberg.register_validator('non_empty', validator)
89
+ result = Kreuzberg.extract_file_sync(test_pdf)
90
+
91
+ expect(result.content.strip).not_to be_empty
92
+ end
93
+ end
94
+
95
+ describe 'validator receives correct parameters' do
96
+ it 'receives result hash with all required fields' do
97
+ received_result = nil
98
+ validator = lambda do |result|
99
+ received_result = result
100
+ end
101
+
102
+ Kreuzberg.register_validator('capture', validator)
103
+ Kreuzberg.extract_file_sync(test_pdf)
104
+
105
+ expect(received_result).to be_a(Hash)
106
+ expect(received_result).to have_key('content')
107
+ expect(received_result).to have_key('mime_type')
108
+ expect(received_result).to have_key('metadata')
109
+ expect(received_result).to have_key('tables')
110
+ end
111
+
112
+ it 'receives correct content in result hash' do
113
+ received_content = nil
114
+ validator = lambda do |result|
115
+ received_content = result['content']
116
+ end
117
+
118
+ Kreuzberg.register_validator('capture_content', validator)
119
+ result = Kreuzberg.extract_file_sync(test_pdf)
120
+
121
+ expect(received_content).to eq(result.content)
122
+ end
123
+ end
124
+
125
+ describe 'multiple validators' do
126
+ it 'executes all registered validators' do
127
+ validator1_called = false
128
+ validator2_called = false
129
+
130
+ validator1 = lambda do |_result|
131
+ validator1_called = true
132
+ end
133
+
134
+ validator2 = lambda do |_result|
135
+ validator2_called = true
136
+ end
137
+
138
+ Kreuzberg.register_validator('val1', validator1)
139
+ Kreuzberg.register_validator('val2', validator2)
140
+ Kreuzberg.extract_file_sync(test_pdf)
141
+
142
+ expect(validator1_called).to be true
143
+ expect(validator2_called).to be true
144
+ end
145
+
146
+ it 'stops execution if any validator fails' do
147
+ validator1 = lambda do |_result|
148
+ raise Kreuzberg::Errors::ValidationError, 'First validator failed'
149
+ end
150
+
151
+ validator2 = lambda do |_result|
152
+ raise StandardError, 'This should not be reached'
153
+ end
154
+
155
+ Kreuzberg.register_validator('fail_first', validator1)
156
+ Kreuzberg.register_validator('never_reached', validator2)
157
+
158
+ expect do
159
+ Kreuzberg.extract_file_sync(test_pdf)
160
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /First validator failed/)
161
+ end
162
+ end
163
+
164
+ describe 'unregister_validator' do
165
+ it 'removes a registered validator by name' do
166
+ validator = lambda do |_result|
167
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called'
168
+ end
169
+
170
+ Kreuzberg.register_validator('removable', validator)
171
+ Kreuzberg.unregister_validator('removable')
172
+
173
+ expect do
174
+ Kreuzberg.extract_file_sync(test_pdf)
175
+ end.not_to raise_error
176
+ end
177
+
178
+ it 'does not affect other registered validators' do
179
+ validator1_called = false
180
+ validator3_called = false
181
+
182
+ validator1 = lambda do |_result|
183
+ validator1_called = true
184
+ end
185
+
186
+ validator2 = lambda do |_result|
187
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called'
188
+ end
189
+
190
+ validator3 = lambda do |_result|
191
+ validator3_called = true
192
+ end
193
+
194
+ Kreuzberg.register_validator('keep1', validator1)
195
+ Kreuzberg.register_validator('remove', validator2)
196
+ Kreuzberg.register_validator('keep3', validator3)
197
+
198
+ Kreuzberg.unregister_validator('remove')
199
+ Kreuzberg.extract_file_sync(test_pdf)
200
+
201
+ expect(validator1_called).to be true
202
+ expect(validator3_called).to be true
203
+ end
204
+ end
205
+
206
+ describe 'clear_validators' do
207
+ it 'removes all registered validators' do
208
+ validator1 = lambda do |_result|
209
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called 1'
210
+ end
211
+
212
+ validator2 = lambda do |_result|
213
+ raise Kreuzberg::Errors::ValidationError, 'Should not be called 2'
214
+ end
215
+
216
+ Kreuzberg.register_validator('val1', validator1)
217
+ Kreuzberg.register_validator('val2', validator2)
218
+
219
+ Kreuzberg.clear_validators
220
+
221
+ expect do
222
+ Kreuzberg.extract_file_sync(test_pdf)
223
+ end.not_to raise_error
224
+ end
225
+ end
226
+
227
+ describe 'list_validators' do
228
+ it 'returns empty array when no validators registered' do
229
+ Kreuzberg.clear_validators
230
+ validators = Kreuzberg.list_validators
231
+ expect(validators).to be_an(Array)
232
+ expect(validators).to be_empty
233
+ end
234
+
235
+ it 'returns validator names after registration' do
236
+ Kreuzberg.clear_validators
237
+ validator = ->(result) {}
238
+ Kreuzberg.register_validator('test-validator', validator)
239
+ validators = Kreuzberg.list_validators
240
+ expect(validators).to include('test-validator')
241
+ Kreuzberg.clear_validators
242
+ end
243
+
244
+ it 'returns all registered validator names' do
245
+ Kreuzberg.clear_validators
246
+ validator1 = ->(result) {}
247
+ validator2 = ->(result) {}
248
+ validator3 = ->(result) {}
249
+
250
+ Kreuzberg.register_validator('validator-one', validator1)
251
+ Kreuzberg.register_validator('validator-two', validator2)
252
+ Kreuzberg.register_validator('validator-three', validator3)
253
+
254
+ validators = Kreuzberg.list_validators
255
+ expect(validators).to contain_exactly('validator-one', 'validator-two', 'validator-three')
256
+ Kreuzberg.clear_validators
257
+ end
258
+
259
+ it 'reflects changes after unregistration' do
260
+ Kreuzberg.clear_validators
261
+ validator = ->(result) {}
262
+ Kreuzberg.register_validator('temp-validator', validator)
263
+
264
+ validators_before = Kreuzberg.list_validators
265
+ expect(validators_before).to include('temp-validator')
266
+
267
+ Kreuzberg.unregister_validator('temp-validator')
268
+
269
+ validators_after = Kreuzberg.list_validators
270
+ expect(validators_after).not_to include('temp-validator')
271
+ Kreuzberg.clear_validators
272
+ end
273
+ end
274
+ end
@@ -0,0 +1,39 @@
1
+ # Test configuration file for Kreuzberg Ruby bindings
2
+
3
+ use_cache = false
4
+ enable_quality_processing = true
5
+ force_ocr = true
6
+
7
+ [ocr]
8
+ backend = "tesseract"
9
+ language = "deu"
10
+
11
+ [chunking]
12
+ max_chars = 500
13
+ max_overlap = 100
14
+ preset = "fast"
15
+
16
+ [language_detection]
17
+ enabled = true
18
+ min_confidence = 0.9
19
+
20
+ [pdf_options]
21
+ extract_images = true
22
+ passwords = ["secret", "backup"]
23
+ extract_metadata = true
24
+
25
+ [images]
26
+ extract_images = true
27
+ target_dpi = 600
28
+ max_image_dimension = 2000
29
+ auto_adjust_dpi = false
30
+ min_dpi = 150
31
+ max_dpi = 600
32
+
33
+ [postprocessor]
34
+ enabled = true
35
+ enabled_processors = ["quality", "formatting"]
36
+
37
+ [token_reduction]
38
+ mode = "moderate"
39
+ preserve_important_words = true
@@ -0,0 +1,41 @@
1
+
2
+ use_cache: false
3
+ enable_quality_processing: true
4
+ force_ocr: true
5
+
6
+ ocr:
7
+ backend: tesseract
8
+ language: fra
9
+
10
+ chunking:
11
+ max_chars: 750
12
+ max_overlap: 150
13
+ preset: balanced
14
+
15
+ language_detection:
16
+ enabled: true
17
+ min_confidence: 0.85
18
+
19
+ pdf_options:
20
+ extract_images: false
21
+ passwords:
22
+ - password1
23
+ - password2
24
+ extract_metadata: true
25
+
26
+ images:
27
+ extract_images: true
28
+ target_dpi: 300
29
+ max_image_dimension: 4096
30
+ auto_adjust_dpi: true
31
+ min_dpi: 72
32
+ max_dpi: 600
33
+
34
+ postprocessor:
35
+ enabled: false
36
+ disabled_processors:
37
+ - token_reduction
38
+
39
+ token_reduction:
40
+ mode: light
41
+ preserve_important_words: false
@@ -0,0 +1,4 @@
1
+ # Invalid TOML file for testing error handling
2
+ use_cache = "not_a_boolean"
3
+ [ocr
4
+ backend = "tesseract"
@@ -0,0 +1,178 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Basic smoke tests to verify package structure and imports work
4
+ require 'stringio'
5
+
6
# Smoke suite: checks that the gem loads, that the expected constants and
# module-level methods are present, and that a trivial plain-text extraction
# works through both the bytes API and the file API.
RSpec.describe 'Kreuzberg package' do
  describe 'import and structure' do
    it 'can be required without errors' do
      expect { require 'kreuzberg' }.not_to raise_error
    end

    it 'has a version constant' do
      expect(Kreuzberg::VERSION).not_to be_nil
      expect(Kreuzberg::VERSION).to be_a(String)
      # \A anchors at the start of the string; `^` would also match at the
      # start of any later line, letting a value such as "junk\n1.2.3" pass.
      # The end is left open on purpose so suffixed versions still match.
      expect(Kreuzberg::VERSION).to match(/\A\d+\.\d+\.\d+/)
    end
  end

  # Each example below asserts that one name is resolvable. `defined?` returns
  # the string 'constant' for a resolvable constant and nil otherwise.
  describe 'public API exports' do
    describe 'configuration classes' do
      it 'exports Config::Extraction' do
        expect(defined?(Kreuzberg::Config::Extraction)).to eq('constant')
      end

      it 'exports Config::OCR' do
        expect(defined?(Kreuzberg::Config::OCR)).to eq('constant')
      end

      it 'exports Config::Chunking' do
        expect(defined?(Kreuzberg::Config::Chunking)).to eq('constant')
      end

      it 'exports Config::LanguageDetection' do
        expect(defined?(Kreuzberg::Config::LanguageDetection)).to eq('constant')
      end

      it 'exports Config::PDF' do
        expect(defined?(Kreuzberg::Config::PDF)).to eq('constant')
      end

      it 'exports Config::HtmlOptions' do
        expect(defined?(Kreuzberg::Config::HtmlOptions)).to eq('constant')
      end

      it 'exports Config::Keywords' do
        expect(defined?(Kreuzberg::Config::Keywords)).to eq('constant')
      end
    end

    describe 'result classes' do
      it 'exports Result' do
        expect(defined?(Kreuzberg::Result)).to eq('constant')
      end

      it 'exports Result::Table' do
        expect(defined?(Kreuzberg::Result::Table)).to eq('constant')
      end

      it 'exports Result::Chunk' do
        expect(defined?(Kreuzberg::Result::Chunk)).to eq('constant')
      end

      it 'exports Result::Image' do
        expect(defined?(Kreuzberg::Result::Image)).to eq('constant')
      end
    end

    describe 'exception classes' do
      it 'exports Errors::Error' do
        expect(defined?(Kreuzberg::Errors::Error)).to eq('constant')
      end

      it 'exports Errors::ValidationError' do
        expect(defined?(Kreuzberg::Errors::ValidationError)).to eq('constant')
      end

      it 'exports Errors::ParsingError' do
        expect(defined?(Kreuzberg::Errors::ParsingError)).to eq('constant')
      end

      it 'exports Errors::OCRError' do
        expect(defined?(Kreuzberg::Errors::OCRError)).to eq('constant')
      end

      it 'exports Errors::MissingDependencyError' do
        expect(defined?(Kreuzberg::Errors::MissingDependencyError)).to eq('constant')
      end

      it 'exports Errors::IOError' do
        expect(defined?(Kreuzberg::Errors::IOError)).to eq('constant')
      end

      it 'exports Errors::PluginError' do
        expect(defined?(Kreuzberg::Errors::PluginError)).to eq('constant')
      end
    end

    # The method groups only check that Kreuzberg responds to each name;
    # nothing is invoked here.
    describe 'extraction functions (sync)' do
      it 'exports extract_file_sync' do
        expect(Kreuzberg).to respond_to(:extract_file_sync)
      end

      it 'exports extract_bytes_sync' do
        expect(Kreuzberg).to respond_to(:extract_bytes_sync)
      end

      it 'exports batch_extract_files_sync' do
        expect(Kreuzberg).to respond_to(:batch_extract_files_sync)
      end
    end

    describe 'extraction functions (async)' do
      it 'exports extract_file' do
        expect(Kreuzberg).to respond_to(:extract_file)
      end

      it 'exports extract_bytes' do
        expect(Kreuzberg).to respond_to(:extract_bytes)
      end

      it 'exports batch_extract_files' do
        expect(Kreuzberg).to respond_to(:batch_extract_files)
      end
    end

    describe 'utility modules' do
      it 'exports CLI' do
        expect(defined?(Kreuzberg::CLI)).to eq('constant')
      end

      it 'exports CLIProxy' do
        expect(defined?(Kreuzberg::CLIProxy)).to eq('constant')
      end

      it 'exports APIProxy' do
        expect(defined?(Kreuzberg::APIProxy)).to eq('constant')
      end

      it 'exports MCPProxy' do
        expect(defined?(Kreuzberg::MCPProxy)).to eq('constant')
      end
    end
  end

  describe 'module structure' do
    it 'defines Kreuzberg as a module' do
      expect(Kreuzberg).to be_a(Module)
    end

    it 'defines Kreuzberg::Config as a module' do
      expect(Kreuzberg::Config).to be_a(Module)
    end

    it 'defines Kreuzberg::Errors as a module' do
      expect(Kreuzberg::Errors).to be_a(Module)
    end
  end

  describe 'basic extraction smoke tests' do
    it 'extracts inline text via bytes API' do
      bytes = StringIO.new('Hello from Kreuzberg')
      # Only the underlying String is handed over, together with its MIME type.
      result = Kreuzberg.extract_bytes_sync(bytes.string, 'text/plain')

      expect(result.content).to include('Hello')
      expect(result.mime_type).to eq('text/plain')
    end

    it 'extracts from small temp file via sync API' do
      # create_test_file is a spec helper that is not defined in this file.
      file = create_test_file('Simple document for smoke testing')
      result = Kreuzberg.extract_file_sync(file)

      expect(result.content).to include('Simple document')
      expect(result.mime_type).to eq('text/plain')
    ensure
      # `file` is nil if the helper itself raised, hence the guard.
      File.delete(file) if file && File.exist?(file)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kreuzberg'
4
+ require 'tmpdir'
5
+ require 'fileutils'
6
+
7
# Suite-wide RSpec configuration plus a small set of path/file helpers that
# are mixed into every example group.
RSpec.configure do |config|
  config.expect_with :rspec do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  config.mock_with :rspec do |mocks|
    mocks.verify_partial_doubles = true
  end

  config.shared_context_metadata_behavior = :apply_to_host_groups
  config.filter_run_when_matching :focus
  config.example_status_persistence_file_path = 'spec/examples.txt'
  config.disable_monkey_patching!
  config.warnings = true
  # The 'doc' formatter is selected only when exactly one spec file is run.
  config.default_formatter = 'doc' if config.files_to_run.one?
  config.order = :random
  # Seed Kernel's RNG with the same seed RSpec uses.
  Kernel.srand config.seed

  # Helpers
  config.include(Module.new do
    # Builds the path of a file inside the `fixtures` directory that sits
    # next to this file.
    #
    # @param filename [String] name of the fixture file
    # @return [String] joined path; existence is not checked
    def fixture_path(filename)
      File.join(__dir__, 'fixtures', filename)
    end

    # Builds the absolute path of a file inside the `test_documents`
    # directory located three levels above this file's directory.
    #
    # @param relative_path [String] path relative to `test_documents`
    # @return [String] expanded absolute path; existence is not checked
    def test_document_path(relative_path)
      # Go up from packages/ruby/spec to project root, then into test_documents
      File.expand_path(File.join(__dir__, '..', '..', '..', 'test_documents', relative_path))
    end

    # Writes +content+ to a file and returns the file's path.
    #
    # This helper never deletes the file; cleanup is left to the caller.
    # With the defaults every call targets the same path
    # (`Dir.tmpdir`/test.txt), so a later call overwrites an earlier file.
    # Pass a distinct +filename+ or a private +dir+ to keep files apart.
    #
    # @param content [String] data to write
    # @param filename [String] base name of the file to create
    # @param dir [String] directory to create the file in; it must already
    #   exist (defaults to the system temp directory)
    # @return [String] path of the written file
    def create_test_file(content, filename: 'test.txt', dir: Dir.tmpdir)
      path = File.join(dir, filename)
      File.write(path, content)
      path
    end
  end)
end