kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,227 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe 'Cache Management' do
6
+ let(:test_pdf) do
7
+ test_document_path('pdfs/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
8
+ end
9
+ let(:test_text) { test_document_path('text/contract_test.txt') }
10
+ let(:test_docx) { test_document_path('documents/contract.docx') }
11
+
12
+ before do
13
+ Kreuzberg.clear_cache
14
+ end
15
+
16
+ after do
17
+ Kreuzberg.clear_cache
18
+ end
19
+
20
+ describe 'clear_cache' do
21
+ it 'removes all cached results' do
22
+ Kreuzberg.extract_file_sync(test_pdf)
23
+ Kreuzberg.extract_file_sync(test_text)
24
+
25
+ stats_before = Kreuzberg.cache_stats
26
+ expect(stats_before['total_entries']).to be_positive
27
+
28
+ Kreuzberg.clear_cache
29
+
30
+ stats_after = Kreuzberg.cache_stats
31
+ expect(stats_after['total_entries']).to eq(0)
32
+ expect(stats_after['total_size_bytes']).to eq(0)
33
+ end
34
+
35
+ it 'returns nil (void return)' do
36
+ result = Kreuzberg.clear_cache
37
+ expect(result).to be_nil
38
+ end
39
+
40
+ it 'can be called multiple times safely' do
41
+ Kreuzberg.clear_cache
42
+ Kreuzberg.clear_cache
43
+ Kreuzberg.clear_cache
44
+
45
+ stats = Kreuzberg.cache_stats
46
+ expect(stats['total_entries']).to eq(0)
47
+ end
48
+
49
+ it 'does not affect future extractions' do
50
+ Kreuzberg.extract_file_sync(test_pdf)
51
+ Kreuzberg.clear_cache
52
+
53
+ result = Kreuzberg.extract_file_sync(test_pdf)
54
+
55
+ expect(result).to be_a(Kreuzberg::Result)
56
+ expect(result.content).not_to be_empty
57
+ end
58
+ end
59
+
60
+ describe 'cache_stats' do
61
+ it 'returns hash with correct structure' do
62
+ stats = Kreuzberg.cache_stats
63
+
64
+ expect(stats).to be_a(Hash)
65
+ expect(stats).to have_key('total_entries')
66
+ expect(stats).to have_key('total_size_bytes')
67
+ end
68
+
69
+ it 'returns zero stats when cache is empty' do
70
+ Kreuzberg.clear_cache
71
+ stats = Kreuzberg.cache_stats
72
+
73
+ expect(stats['total_entries']).to eq(0)
74
+ expect(stats['total_size_bytes']).to eq(0)
75
+ end
76
+
77
+ it 'shows entries after extractions' do
78
+ Kreuzberg.clear_cache
79
+
80
+ Kreuzberg.extract_file_sync(test_pdf)
81
+ stats = Kreuzberg.cache_stats
82
+
83
+ expect(stats['total_entries']).to be_positive
84
+ end
85
+
86
+ it 'shows total size in bytes' do
87
+ Kreuzberg.clear_cache
88
+
89
+ Kreuzberg.extract_file_sync(test_pdf)
90
+ stats = Kreuzberg.cache_stats
91
+
92
+ expect(stats['total_size_bytes']).to be_positive
93
+ end
94
+
95
+ it 'increases stats with multiple extractions' do
96
+ Kreuzberg.clear_cache
97
+
98
+ Kreuzberg.extract_file_sync(test_pdf)
99
+ stats_after_one = Kreuzberg.cache_stats
100
+
101
+ Kreuzberg.extract_file_sync(test_text)
102
+ stats_after_two = Kreuzberg.cache_stats
103
+
104
+ expect(stats_after_two['total_entries']).to be >= stats_after_one['total_entries']
105
+ end
106
+ end
107
+
108
+ describe 'cache behavior across extractions' do
109
+ it 'caches extraction results' do
110
+ Kreuzberg.clear_cache
111
+ stats_initial = Kreuzberg.cache_stats
112
+ expect(stats_initial['total_entries']).to eq(0)
113
+
114
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
115
+ stats_after_first = Kreuzberg.cache_stats
116
+ expect(stats_after_first['total_entries']).to be_positive
117
+
118
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
119
+ stats_after_second = Kreuzberg.cache_stats
120
+
121
+ expect(result1.content).to eq(result2.content)
122
+ expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'] + 1)
123
+ end
124
+
125
+ it 'tracks different files separately' do
126
+ Kreuzberg.clear_cache
127
+
128
+ Kreuzberg.extract_file_sync(test_pdf)
129
+ stats_after_pdf = Kreuzberg.cache_stats
130
+
131
+ Kreuzberg.extract_file_sync(test_text)
132
+ stats_after_text = Kreuzberg.cache_stats
133
+
134
+ expect(stats_after_text['total_entries']).to be >= stats_after_pdf['total_entries']
135
+ end
136
+
137
+ it 'second extraction of same file may use cache' do # asserts only that both runs return equal content and mime_type
137
+ Kreuzberg.clear_cache
138
+
139
+ Time.now # NOTE(review): result is discarded, so this is a no-op; presumably left over from removed timing code
140
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
141
+ Time.now # NOTE(review): no-op (value unused)
142
+
143
+ Time.now # NOTE(review): no-op (value unused)
144
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
145
+ Time.now # NOTE(review): no-op (value unused); nothing here measures or asserts a cache hit
146
+
147
+ expect(result1.content).to eq(result2.content)
148
+ expect(result1.mime_type).to eq(result2.mime_type)
149
+ end
151
+
152
+ it 'clears cache between extractions when requested' do
153
+ result1 = Kreuzberg.extract_file_sync(test_pdf)
154
+
155
+ Kreuzberg.clear_cache
156
+
157
+ result2 = Kreuzberg.extract_file_sync(test_pdf)
158
+
159
+ expect(result1.content).to eq(result2.content)
160
+ end
161
+ end
162
+
163
+ describe 'cache with different configurations' do
164
+ it 'respects use_cache flag in configs' do
165
+ Kreuzberg.clear_cache
166
+
167
+ config1 = Kreuzberg::Config::Extraction.new(use_cache: true)
168
+ config2 = Kreuzberg::Config::Extraction.new(use_cache: false)
169
+
170
+ Kreuzberg.extract_file_sync(test_pdf, config: config1)
171
+ stats_after_first = Kreuzberg.cache_stats
172
+
173
+ Kreuzberg.extract_file_sync(test_pdf, config: config2)
174
+ stats_after_second = Kreuzberg.cache_stats
175
+
176
+ expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'])
177
+ end
178
+ end
179
+
180
+ describe 'cache stats consistency' do
181
+ it 'stats remain consistent after clear' do
182
+ Kreuzberg.extract_file_sync(test_pdf)
183
+ Kreuzberg.extract_file_sync(test_text)
184
+
185
+ Kreuzberg.clear_cache
186
+ stats = Kreuzberg.cache_stats
187
+
188
+ expect(stats['total_entries']).to eq(0)
189
+ expect(stats['total_size_bytes']).to eq(0)
190
+ end
191
+
192
+ it 'stats update correctly after new extractions' do
193
+ Kreuzberg.clear_cache
194
+
195
+ Kreuzberg.extract_file_sync(test_pdf)
196
+ Kreuzberg.cache_stats
197
+
198
+ Kreuzberg.clear_cache
199
+
200
+ Kreuzberg.extract_file_sync(test_text)
201
+ stats2 = Kreuzberg.cache_stats
202
+
203
+ expect(stats2['total_entries']).to be_positive
204
+ end
205
+ end
206
+
207
+ describe 'integration with batch operations' do
208
+ it 'caches batch extraction results' do
209
+ Kreuzberg.clear_cache
210
+
211
+ results = Kreuzberg.batch_extract_files_sync([test_pdf, test_text])
212
+ stats = Kreuzberg.cache_stats
213
+
214
+ expect(results.length).to eq(2)
215
+ expect(stats['total_entries']).to be_positive
216
+ end
217
+
218
+ it 'clear_cache affects batch extractions' do
219
+ Kreuzberg.batch_extract_files_sync([test_pdf, test_text])
220
+
221
+ Kreuzberg.clear_cache
222
+
223
+ stats = Kreuzberg.cache_stats
224
+ expect(stats['total_entries']).to eq(0)
225
+ end
226
+ end
227
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::CLIProxy do
4
+ describe '.find_cli_binary' do
5
+ context 'when binary exists' do
6
+ it 'finds the binary in search paths' do
7
+ binary = described_class.find_cli_binary
8
+ expect(binary).to be_a(Pathname)
9
+ expect(binary.file?).to be true
10
+ end
11
+ end
12
+
13
+ context 'when binary does not exist' do
14
+ before do
15
+ allow(described_class).to receive(:search_paths).and_return([])
16
+ end
17
+
18
+ it 'raises MissingBinaryError' do
19
+ expect do
20
+ described_class.find_cli_binary
21
+ end.to raise_error(Kreuzberg::CLIProxy::MissingBinaryError, /not found/)
22
+ end
23
+ end
24
+ end
25
+
26
+ describe '.call' do
27
+ context 'when binary is available' do
28
+ it 'executes CLI command successfully' do
29
+ output = described_class.call(['--version'])
30
+ expect(output).to be_a(String)
31
+ expect(output).not_to be_empty
32
+ end
33
+
34
+ it 'raises CLIExecutionError on failure' do
35
+ expect do
36
+ described_class.call(['invalid-command'])
37
+ end.to raise_error(Kreuzberg::CLIProxy::CLIExecutionError)
38
+ end
39
+ end
40
+ end
41
+
42
+ describe '.search_paths' do
43
+ it 'returns an array of Pathname objects' do
44
+ paths = described_class.search_paths('kreuzberg')
45
+ expect(paths).to be_an(Array)
46
+ expect(paths).to all(be_a(Pathname))
47
+ end
48
+
49
+ it 'includes expected search locations' do
50
+ paths = described_class.search_paths('kreuzberg')
51
+ path_strings = paths.map(&:to_s)
52
+
53
+ expect(path_strings.any? { |p| p.include?('lib/bin') }).to be true
54
+ expect(path_strings.any? { |p| p.include?('target/release') }).to be true
55
+ end
56
+ end
57
+
58
+ describe '.root_path' do
59
+ it 'returns a Pathname' do
60
+ expect(described_class.root_path).to be_a(Pathname)
61
+ end
62
+
63
+ it 'points to an existing directory' do
64
+ expect(described_class.root_path.directory?).to be true
65
+ end
66
+ end
67
+
68
+ describe '.lib_path' do
69
+ it 'returns a Pathname' do
70
+ expect(described_class.lib_path).to be_a(Pathname)
71
+ end
72
+
73
+ it 'points to an existing directory' do
74
+ expect(described_class.lib_path.directory?).to be true
75
+ end
76
+ end
77
+
78
+ describe '.missing_binary_message' do
79
+ it 'returns helpful error message' do
80
+ message = described_class.missing_binary_message
81
+ expect(message).to include('cargo build')
82
+ expect(message).to include('kreuzberg-cli')
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::CLI do
4
+ describe '.extract' do
5
+ it 'extracts content from a file' do
6
+ path = test_document_path('documents/simple.odt')
7
+ output = described_class.extract(path)
8
+
9
+ expect(output).to be_a(String)
10
+ expect(output).not_to be_empty
11
+ end
12
+
13
+ it 'accepts output format option' do
14
+ path = test_document_path('documents/simple.odt')
15
+ output = described_class.extract(path, output: 'json')
16
+
17
+ expect(output).to be_a(String)
18
+ expect(output).not_to be_empty
19
+ end
20
+
21
+ it 'accepts OCR option' do
22
+ path = test_document_path('pdfs/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
23
+ output = described_class.extract(path, ocr: false)
24
+
25
+ expect(output).to be_a(String)
26
+ expect(output).not_to be_empty
27
+ end
28
+ end
29
+
30
+ describe '.detect' do
31
+ it 'detects MIME type' do
32
+ path = test_document_path('documents/simple.odt')
33
+ mime_type = described_class.detect(path)
34
+
35
+ expect(mime_type).to be_a(String)
36
+ expect(mime_type).not_to be_empty
37
+ end
38
+ end
39
+
40
+ describe '.version' do
41
+ it 'returns version string' do
42
+ version = described_class.version
43
+ expect(version).to be_a(String)
44
+ expect(version).to match(/\d+\.\d+/)
45
+ end
46
+ end
47
+
48
+ describe '.help' do
49
+ it 'returns help text' do
50
+ help_text = described_class.help
51
+ expect(help_text).to be_a(String)
52
+ expect(help_text).to include('kreuzberg')
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,345 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config do
4
# Construction (default and custom values) and hash conversion of the OCR
# config object.
describe Kreuzberg::Config::OCR do
  it 'creates with default values' do
    expect(described_class.new).to have_attributes(
      backend: 'tesseract',
      language: 'eng',
      tesseract_config: be_nil
    )
  end

  it 'creates with custom values' do
    custom = described_class.new(backend: 'easyocr', language: 'deu')

    expect(custom).to have_attributes(backend: 'easyocr', language: 'deu')
  end

  it 'converts to hash' do
    serialized = described_class.new(backend: 'tesseract', language: 'fra').to_h

    expect(serialized).to be_a(Hash)
    expect(serialized).to include(backend: 'tesseract', language: 'fra')
  end
end
32
+
33
# Construction (default and custom values) and hash conversion of the
# chunking config object.
describe Kreuzberg::Config::Chunking do
  it 'creates with default values' do
    expect(described_class.new).to have_attributes(
      max_chars: 1000,
      max_overlap: 200,
      preset: be_nil,
      embedding: be_nil
    )
  end

  it 'creates with custom values' do
    custom = described_class.new(max_chars: 500, max_overlap: 100, preset: 'fast')

    expect(custom).to have_attributes(max_chars: 500, max_overlap: 100, preset: 'fast')
  end

  it 'converts to hash' do
    serialized = described_class.new(max_chars: 750).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized).to include(max_chars: 750)
  end
end
63
+
64
# Construction (default and custom values) and hash conversion of the
# language-detection config object.
describe Kreuzberg::Config::LanguageDetection do
  it 'creates with default values' do
    expect(described_class.new).to have_attributes(
      enabled: be(false),
      min_confidence: 0.5
    )
  end

  it 'creates with custom values' do
    custom = described_class.new(enabled: true, min_confidence: 0.9)

    expect(custom).to have_attributes(enabled: be(true), min_confidence: 0.9)
  end

  it 'converts to hash' do
    serialized = described_class.new(enabled: true, min_confidence: 0.75).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized).to include(enabled: be(true), min_confidence: 0.75)
  end
end
88
+
89
# Construction (default and custom values) and hash conversion of the PDF
# config object.
describe Kreuzberg::Config::PDF do
  it 'creates with default values' do
    expect(described_class.new).to have_attributes(
      extract_images: be(false),
      passwords: be_nil,
      extract_metadata: be(true)
    )
  end

  it 'creates with custom values' do
    secrets = %w[secret backup]
    custom = described_class.new(extract_images: true, passwords: secrets)

    expect(custom).to have_attributes(extract_images: be(true), passwords: %w[secret backup])
  end

  it 'converts to hash' do
    serialized = described_class.new(extract_images: true, passwords: ['test']).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized).to include(extract_images: be(true), passwords: ['test'])
  end
end
117
+
118
# Specs for Kreuzberg::Config::Extraction: loading from the TOML/YAML fixture
# files, building instances from defaults, config objects and plain hashes,
# and converting back to a hash.
describe Kreuzberg::Config::Extraction do
  describe '.from_file' do
    # Fixture paths are built from this spec's directory. Because __dir__ is
    # absolute, these are absolute paths that still contain a '..' segment.
    let(:toml_path) { File.join(__dir__, '..', 'fixtures', 'config.toml') }
    let(:yaml_path) { File.join(__dir__, '..', 'fixtures', 'config.yaml') }

    # Lazily loaded configs; an example only reads the file(s) it references.
    let(:toml_config) { described_class.from_file(toml_path) }
    let(:yaml_config) { described_class.from_file(yaml_path) }

    it 'loads configuration from TOML file' do
      expect(toml_config.use_cache).to be false
      expect(toml_config.enable_quality_processing).to be true
      expect(toml_config.force_ocr).to be true
    end

    it 'loads OCR config from TOML file' do
      expect(toml_config.ocr).to be_a(Kreuzberg::Config::OCR)
      expect(toml_config.ocr.backend).to eq('tesseract')
      expect(toml_config.ocr.language).to eq('deu')
    end

    it 'loads chunking config from TOML file' do
      expect(toml_config.chunking).to be_a(Kreuzberg::Config::Chunking)
      expect(toml_config.chunking.max_chars).to eq(500)
      expect(toml_config.chunking.max_overlap).to eq(100)
      expect(toml_config.chunking.preset).to eq('fast')
    end

    it 'loads language detection config from TOML file' do
      expect(toml_config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
      expect(toml_config.language_detection.enabled).to be true
      expect(toml_config.language_detection.min_confidence).to eq(0.9)
    end

    it 'loads PDF options from TOML file' do
      expect(toml_config.pdf_options).to be_a(Kreuzberg::Config::PDF)
      expect(toml_config.pdf_options.extract_images).to be true
      expect(toml_config.pdf_options.passwords).to eq(%w[secret backup])
      expect(toml_config.pdf_options.extract_metadata).to be true
    end

    it 'loads configuration from YAML file' do
      expect(yaml_config.use_cache).to be false
      expect(yaml_config.enable_quality_processing).to be true
      expect(yaml_config.force_ocr).to be true
    end

    it 'loads OCR config from YAML file' do
      expect(yaml_config.ocr).to be_a(Kreuzberg::Config::OCR)
      expect(yaml_config.ocr.backend).to eq('tesseract')
      expect(yaml_config.ocr.language).to eq('fra')
    end

    it 'loads chunking config from YAML file' do
      expect(yaml_config.chunking).to be_a(Kreuzberg::Config::Chunking)
      expect(yaml_config.chunking.max_chars).to eq(750)
      expect(yaml_config.chunking.max_overlap).to eq(150)
      expect(yaml_config.chunking.preset).to eq('balanced')
    end

    it 'works with absolute paths' do
      # expand_path normalises away the '..' segment, giving a canonical
      # absolute path.
      config_path = File.expand_path('../fixtures/config.toml', __dir__)
      config = described_class.from_file(config_path)

      expect(config.use_cache).to be false
    end

    it 'works with relative paths' do
      # A path built from __dir__ is always absolute, so it cannot exercise
      # relative-path handling. Switch into the fixtures directory for the
      # duration of the call and pass a bare file name instead; Dir.chdir
      # restores the previous working directory when the block exits.
      fixtures_dir = File.expand_path('../fixtures', __dir__)
      config = Dir.chdir(fixtures_dir) { described_class.from_file('config.yaml') }

      expect(config.use_cache).to be false
    end

    it 'raises error for non-existent file' do
      expect do
        described_class.from_file('/path/to/nonexistent/config.toml')
      end.to raise_error(Kreuzberg::Errors::ValidationError, /Failed to read config file/)
    end

    it 'raises error for invalid TOML file' do
      config_path = File.join(__dir__, '..', 'fixtures', 'invalid_config.toml')
      expect do
        described_class.from_file(config_path)
      end.to raise_error(Kreuzberg::Errors::ValidationError, /Invalid TOML/)
    end

    it 'detects file format from extension' do
      # The two fixtures carry different OCR languages, so a matching value
      # shows that each file was parsed with its own format.
      expect(toml_config.ocr.language).to eq('deu')
      expect(yaml_config.ocr.language).to eq('fra')
    end
  end

  it 'creates with default values' do
    config = described_class.new

    expect(config.use_cache).to be true
    expect(config.enable_quality_processing).to be false
    expect(config.force_ocr).to be false
    expect(config.ocr).to be_nil
    expect(config.chunking).to be_nil
    expect(config.language_detection).to be_nil
    expect(config.pdf_options).to be_nil
  end

  it 'creates with custom values' do
    ocr = Kreuzberg::Config::OCR.new(backend: 'easyocr')
    chunking = Kreuzberg::Config::Chunking.new(max_chars: 500)
    lang = Kreuzberg::Config::LanguageDetection.new(enabled: true)
    pdf = Kreuzberg::Config::PDF.new(extract_images: true)

    config = described_class.new(
      use_cache: false,
      enable_quality_processing: true,
      force_ocr: true,
      ocr: ocr,
      chunking: chunking,
      language_detection: lang,
      pdf_options: pdf
    )

    expect(config.use_cache).to be false
    expect(config.enable_quality_processing).to be true
    expect(config.force_ocr).to be true
    expect(config.ocr).to eq(ocr)
    expect(config.chunking).to eq(chunking)
    expect(config.language_detection).to eq(lang)
    expect(config.pdf_options).to eq(pdf)
  end

  it 'accepts hash for nested configs' do
    # Plain hashes are expected to be turned into the matching config classes.
    config = described_class.new(
      ocr: { backend: 'tesseract', language: 'eng' },
      chunking: { max_chars: 500 }
    )

    expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(config.ocr.backend).to eq('tesseract')
    expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(config.chunking.max_chars).to eq(500)
  end

  it 'converts to hash' do
    config = described_class.new(
      use_cache: false,
      ocr: { backend: 'tesseract' }
    )
    hash = config.to_h

    expect(hash).to be_a(Hash)
    expect(hash[:use_cache]).to be false
    # Nested config objects are expected to be serialized as hashes too.
    expect(hash[:ocr]).to be_a(Hash)
    expect(hash[:ocr][:backend]).to eq('tesseract')
  end

  it 'raises error for invalid config type' do
    expect do
      described_class.new(ocr: 'invalid')
    end.to raise_error(ArgumentError, /Expected.*OCR/)
  end
end
302
+
303
# Verifies that Kreuzberg::ExtractionConfig is the same class as
# Kreuzberg::Config::Extraction and can be used interchangeably with it.
describe 'ExtractionConfig alias' do
  let(:aliased_class) { Kreuzberg::ExtractionConfig }

  it 'exists at module level' do
    alias_defined = Kreuzberg.const_defined?(:ExtractionConfig)

    expect(alias_defined).to be(true)
  end

  it 'is the same class as Config::Extraction' do
    expect(aliased_class).to eq(Kreuzberg::Config::Extraction)
  end

  it 'can be instantiated using the alias' do
    instance = aliased_class.new(use_cache: false)

    expect(instance).to be_a(Kreuzberg::Config::Extraction)
    expect(instance).to have_attributes(use_cache: be(false))
  end

  it 'supports all methods through the alias' do
    instance = aliased_class.new(
      use_cache: false,
      force_ocr: true,
      ocr: { backend: 'tesseract', language: 'eng' }
    )

    expect(instance).to have_attributes(use_cache: be(false), force_ocr: be(true))
    expect(instance.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(instance.ocr.backend).to eq('tesseract')

    serialized = instance.to_h
    expect(serialized).to include(use_cache: be(false), force_ocr: be(true))
  end

  it 'supports from_file through the alias' do
    fixture = File.join(__dir__, '..', 'fixtures', 'config.toml')
    loaded = aliased_class.from_file(fixture)

    expect(loaded).to be_a(Kreuzberg::Config::Extraction)
    expect(loaded).to have_attributes(
      use_cache: be(false),
      enable_quality_processing: be(true)
    )
  end
end
345
+ end