kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Kreuzberg
6
+ # Error code constants matching kreuzberg-ffi error codes
7
+ ERROR_CODE_SUCCESS = 0
8
+ ERROR_CODE_GENERIC = 1
9
+ ERROR_CODE_PANIC = 2
10
+ ERROR_CODE_INVALID_ARGUMENT = 3
11
+ ERROR_CODE_IO = 4
12
+ ERROR_CODE_PARSING = 5
13
+ ERROR_CODE_OCR = 6
14
+ ERROR_CODE_MISSING_DEPENDENCY = 7
15
+
16
module Errors
  # Panic context information from FFI error introspection.
  #
  # A plain value object holding the location and message fields that are
  # read from a JSON payload (see {.from_json}).
  class PanicContext
    attr_reader :file, :line, :function, :message, :timestamp_secs

    # @param file [String] source file reported in the payload
    # @param line [Integer] line number reported in the payload
    # @param function [String] function name reported in the payload
    # @param message [String] panic message
    # @param timestamp_secs [Integer] timestamp in seconds
    #   (NOTE(review): presumably Unix epoch seconds - confirm against the FFI layer)
    def initialize(file:, line:, function:, message:, timestamp_secs:)
      @file = file
      @line = line
      @function = function
      @message = message
      @timestamp_secs = timestamp_secs
    end

    # @return [String] "file:line:function: message"
    def to_s
      "#{file}:#{line}:#{function}: #{message}"
    end

    # @return [Hash{Symbol => Object}] all five fields keyed by attribute name
    def to_h
      {
        file:,
        line:,
        function:,
        message:,
        timestamp_secs:
      }
    end

    # Build a PanicContext from a JSON object string.
    #
    # Missing keys fall back to empty strings / zero; unknown keys are ignored.
    #
    # @param json_string [String, nil] JSON document describing the panic
    # @return [PanicContext, nil] nil when the input is nil, empty, not valid
    #   JSON, or valid JSON that is not an object (e.g. `null`, `[]`, `42`)
    def self.from_json(json_string)
      return nil if json_string.nil? || json_string.empty?

      data = JSON.parse(json_string, symbolize_names: true)
      # JSON.parse happily returns nil, arrays, numbers and strings for valid
      # non-object documents; none of those can describe a panic context.
      return nil unless data.is_a?(Hash)

      new(**with_defaults(data))
    rescue JSON::ParserError
      nil
    end

    # Pick the five known fields out of +fields+, substituting a neutral
    # default for each one that is absent or nil.
    def self.with_defaults(fields)
      {
        file: fields[:file] || '',
        line: fields[:line] || 0,
        function: fields[:function] || '',
        message: fields[:message] || '',
        timestamp_secs: fields[:timestamp_secs] || 0
      }
    end
    private_class_method :with_defaults
  end

  # Base error class for all Kreuzberg errors
  class Error < StandardError
    attr_reader :panic_context, :error_code

    # @param message [String, nil] human readable message; when omitted the
    #   standard Ruby default (the class name) is used, so a bare
    #   `raise Kreuzberg::Errors::Error` works like any other exception class
    # @param panic_context [PanicContext, nil] optional panic details
    # @param error_code [Integer, nil] optional numeric error code
    def initialize(message = nil, panic_context: nil, error_code: nil)
      super(message)
      @panic_context = panic_context
      @error_code = error_code
    end
  end

  # Raised when validation fails
  class ValidationError < Error; end

  # Raised when document parsing fails
  class ParsingError < Error
    attr_reader :context

    # @param message [String, nil] human readable message
    # @param context [Object, nil] extra parsing context supplied by the caller
    # @param panic_context [PanicContext, nil] optional panic details
    # @param error_code [Integer, nil] optional numeric error code
    def initialize(message = nil, context: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @context = context
    end
  end

  # Raised when OCR processing fails
  class OCRError < Error
    attr_reader :context

    # @param message [String, nil] human readable message
    # @param context [Object, nil] extra OCR context supplied by the caller
    # @param panic_context [PanicContext, nil] optional panic details
    # @param error_code [Integer, nil] optional numeric error code
    def initialize(message = nil, context: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @context = context
    end
  end

  # Raised when a required dependency is missing
  class MissingDependencyError < Error
    attr_reader :dependency

    # @param message [String, nil] human readable message
    # @param dependency [Object, nil] identifier of the missing dependency
    # @param panic_context [PanicContext, nil] optional panic details
    # @param error_code [Integer, nil] optional numeric error code
    def initialize(message = nil, dependency: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @dependency = dependency
    end
  end

  # Raised when an I/O operation fails
  class IOError < Error; end

  # Raised when plugin operations fail
  class PluginError < Error; end

  # Raised when an unsupported file format or MIME type is encountered
  class UnsupportedFormatError < Error; end
end
118
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
# Provides extraction methods for documents and text.
#
# Every public extraction method follows the same steps: normalize the config
# into a Hash, call the matching `native_*` method, wrap the returned hash(es)
# in Result, and hand the wrapped value to `record_cache_entry!`.
# The `native_*` methods and `record_cache_entry!` are not defined here; the
# object this module is mixed into must provide them.
module ExtractionAPI
  # Extract a single file (synchronous native call).
  #
  # @param path [#to_s] path of the file to extract
  # @param mime_type [#to_s, nil] explicit MIME type; omitted from the native
  #   call when nil/false
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Result]
  def extract_file_sync(path, mime_type: nil, config: nil)
    opts = normalize_config(config)
    wrap_extraction(native_extract_file_sync(*file_arguments(path, mime_type), **opts), opts)
  end

  # Extract in-memory data (synchronous native call).
  #
  # @param data [#to_s] raw document bytes
  # @param mime_type [#to_s] MIME type of +data+
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Result]
  def extract_bytes_sync(data, mime_type, config: nil)
    opts = normalize_config(config)
    wrap_extraction(native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts), opts)
  end

  # Extract several files in one synchronous native call.
  #
  # @param paths [Array<#to_s>] file paths
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Array<Result>]
  def batch_extract_files_sync(paths, config: nil)
    opts = normalize_config(config)
    wrap_batch_extraction(native_batch_extract_files_sync(paths.map(&:to_s), **opts), opts)
  end

  # Extract a single file via `native_extract_file`.
  #
  # @param path [#to_s] path of the file to extract
  # @param mime_type [#to_s, nil] explicit MIME type; omitted from the native
  #   call when nil/false
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Result]
  def extract_file(path, mime_type: nil, config: nil)
    opts = normalize_config(config)
    wrap_extraction(native_extract_file(*file_arguments(path, mime_type), **opts), opts)
  end

  # Extract in-memory data via `native_extract_bytes`.
  #
  # @param data [#to_s] raw document bytes
  # @param mime_type [#to_s] MIME type of +data+
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Result]
  def extract_bytes(data, mime_type, config: nil)
    opts = normalize_config(config)
    wrap_extraction(native_extract_bytes(data.to_s, mime_type.to_s, **opts), opts)
  end

  # Extract several files via `native_batch_extract_files`.
  #
  # @param paths [Array<#to_s>] file paths
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Array<Result>]
  def batch_extract_files(paths, config: nil)
    opts = normalize_config(config)
    wrap_batch_extraction(native_batch_extract_files(paths.map(&:to_s), **opts), opts)
  end

  # Extract several in-memory documents in one synchronous native call.
  #
  # @param data_array [Array<#to_s>] raw document bytes, one entry per document
  # @param mime_types [Array<#to_s>] MIME types
  #   (NOTE(review): presumably parallel to +data_array+ - length is not checked here)
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Array<Result>]
  def batch_extract_bytes_sync(data_array, mime_types, config: nil)
    opts = normalize_config(config)
    hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
    wrap_batch_extraction(hashes, opts)
  end

  # Extract several in-memory documents via `native_batch_extract_bytes`.
  #
  # @param data_array [Array<#to_s>] raw document bytes, one entry per document
  # @param mime_types [Array<#to_s>] MIME types
  #   (NOTE(review): presumably parallel to +data_array+ - length is not checked here)
  # @param config [Hash, #to_h, nil] extraction options
  # @return [Array<Result>]
  def batch_extract_bytes(data_array, mime_types, config: nil)
    opts = normalize_config(config)
    hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
    wrap_batch_extraction(hashes, opts)
  end

  # Turn a user supplied config into the options Hash splatted into native calls.
  #
  # @param config [Hash, #to_h, nil]
  # @return [Hash] `{}` for nil, the same object for a Hash, `config.to_h` otherwise
  def normalize_config(config)
    return {} if config.nil?
    return config if config.is_a?(Hash)

    config.to_h
  end

  private

  # Positional arguments for the single-file native calls: the path, plus the
  # MIME type only when one was given.
  def file_arguments(path, mime_type)
    mime_type ? [path.to_s, mime_type.to_s] : [path.to_s]
  end

  # Wrap one native result hash in a Result and record it in the cache.
  def wrap_extraction(hash, opts)
    result = Result.new(hash)
    record_cache_entry!(result, opts)
    result
  end

  # Wrap each native result hash in a Result and record the whole batch
  # with a single record_cache_entry! call.
  def wrap_batch_extraction(hashes, opts)
    results = hashes.map { |hash| Result.new(hash) }
    record_cache_entry!(results, opts)
    results
  end
end
85
+ end
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'pathname'
5
+ require 'json'
6
+
7
+ module Kreuzberg
8
+ # MCP (Model Context Protocol) server proxy
9
+ #
10
+ # Starts and manages the Kreuzberg MCP server for Claude Desktop integration.
11
+ #
12
+ # @example Start MCP server
13
+ # server = Kreuzberg::MCPProxy::Server.new
14
+ # server.start
15
+ #
16
+ module MCPProxy
17
+ Error = Class.new(Kreuzberg::Errors::Error)
18
+ MissingBinaryError = Class.new(Error)
19
+ ServerError = Class.new(Error)
20
+
21
+ # MCP server instance
22
+ class Server
23
+ attr_reader :pid, :transport
24
+
25
+ # Initialize MCP server
26
+ #
27
+ # @param transport [String] Transport method ("stdio" or "sse")
28
+ #
29
+ def initialize(transport: 'stdio')
30
+ @transport = transport
31
+ @pid = nil
32
+ @stdin = nil
33
+ @stdout = nil
34
+ @stderr = nil
35
+ end
36
+
37
+ # Start the MCP server
38
+ #
39
+ # @return [Integer, nil] Process ID (for SSE) or nil (for stdio)
40
+ #
41
+ def start
42
+ binary = MCPProxy.find_mcp_binary
43
+
44
+ case @transport
45
+ when 'stdio'
46
+ start_stdio(binary)
47
+ when 'sse'
48
+ start_sse(binary)
49
+ else
50
+ raise ServerError, "Unknown transport: #{@transport}"
51
+ end
52
+ end
53
+
54
+ # Stop the server
55
+ #
56
+ # @return [void]
57
+ #
58
+ def stop
59
+ return unless @pid
60
+
61
+ Process.kill('TERM', @pid)
62
+ Process.wait(@pid)
63
+ rescue Errno::ESRCH, Errno::ECHILD
64
+ # Process already dead
65
+ ensure
66
+ @pid = nil
67
+ close_pipes
68
+ end
69
+
70
+ # Send a message to the server (stdio only)
71
+ #
72
+ # @param message [Hash] JSON-RPC message
73
+ # @return [void]
74
+ #
75
+ def send_message(message)
76
+ raise ServerError, 'Can only send messages in stdio mode' unless @transport == 'stdio'
77
+ raise ServerError, 'Server not started' unless @stdin
78
+
79
+ @stdin.puts(JSON.generate(message))
80
+ @stdin.flush
81
+ end
82
+
83
+ # Read a message from the server (stdio only)
84
+ #
85
+ # @return [Hash] JSON-RPC message
86
+ #
87
+ def read_message
88
+ raise ServerError, 'Can only read messages in stdio mode' unless @transport == 'stdio'
89
+ raise ServerError, 'Server not started' unless @stdout
90
+
91
+ line = @stdout.gets
92
+ JSON.parse(line) if line
93
+ end
94
+
95
+ # Check if server is running
96
+ #
97
+ # @return [Boolean]
98
+ #
99
+ def running?
100
+ return false unless @pid
101
+
102
+ Process.kill(0, @pid)
103
+ true
104
+ rescue Errno::ESRCH, Errno::EPERM
105
+ false
106
+ end
107
+
108
+ private
109
+
110
+ def start_stdio(binary)
111
+ @stdin, @stdout, @stderr, wait_thr = Open3.popen3(binary.to_s, 'mcp', '--transport', 'stdio')
112
+ @pid = wait_thr.pid
113
+ nil
114
+ end
115
+
116
+ def start_sse(binary)
117
+ @pid = spawn(
118
+ binary.to_s,
119
+ 'mcp',
120
+ '--transport', 'sse',
121
+ out: $stdout,
122
+ err: $stderr
123
+ )
124
+ Process.detach(@pid)
125
+ sleep 1 # Give server time to start
126
+ @pid
127
+ end
128
+
129
+ def close_pipes
130
+ @stdin&.close
131
+ @stdout&.close
132
+ @stderr&.close
133
+ @stdin = @stdout = @stderr = nil
134
+ end
135
+ end
136
+
137
+ module_function
138
+
139
# Start a server, hand it to the block, and always stop it afterwards.
#
# The server is stopped even when starting it or the block raises.
#
# @param transport [String] Transport method
# @yield [Server] Yields server instance
# @return [Object] Block result
#
# @example
#   Kreuzberg::MCPProxy.run(transport: 'stdio') do |server|
#     server.send_message({ method: 'tools/list' })
#     response = server.read_message
#   end
#
def run(transport: 'stdio')
  server = nil
  begin
    server = Server.new(transport:)
    server.start
    yield(server)
  ensure
    server&.stop
  end
end
158
+
159
# Locate the kreuzberg CLI executable, which also serves MCP.
#
# Asks CLIProxy.search_paths for candidates and picks the first one that
# is an existing file.
#
# @return [Pathname] Path to binary
# @raise [MissingBinaryError] If not found
#
def find_mcp_binary
  executable = if Gem.win_platform?
                 'kreuzberg.exe'
               else
                 'kreuzberg'
               end

  CLIProxy.search_paths(executable).find { |candidate| candidate.file? } ||
    raise(MissingBinaryError, missing_binary_message)
end
172
+
173
# Text shown when the kreuzberg binary cannot be located.
#
# @return [String] multi-line hint without leading/trailing whitespace
#
def missing_binary_message
  hint = <<~MSG
    kreuzberg binary not found for MCP server. Build it with:
    `cargo build --release --package kreuzberg-cli`

    Or ensure kreuzberg is installed with MCP support.
  MSG
  hint.strip
end
185
+ end
186
+ end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ # OCR backend protocol interface.
5
+ #
6
+ # This module defines the protocol that all Ruby OCR backends must implement
7
+ # to be registered with the Rust core via the FFI bridge.
8
+ #
9
+ # OCR backends implement optical character recognition for images and scanned documents.
10
+ # They are called when OCR is enabled in the extraction configuration.
11
+ #
12
+ # @example Implementing a custom OCR backend
13
+ # class CustomOcrBackend
14
+ # include Kreuzberg::OcrBackendProtocol
15
+ #
16
+ # def name
17
+ # "custom-ocr"
18
+ # end
19
+ #
20
+ # def process_image(image_bytes, config)
21
+ # # Perform OCR on image_bytes
22
+ # # This is a placeholder - integrate with a real OCR engine
23
+ # text = my_ocr_engine.recognize(image_bytes, language: config["language"])
24
+ # text
25
+ # end
26
+ # end
27
+ #
28
+ # backend = CustomOcrBackend.new
29
+ # Kreuzberg.register_ocr_backend(backend.name, backend)
30
+ #
31
+ # # Use in extraction
32
+ # result = Kreuzberg.extract_file_sync(
33
+ # "scanned.pdf",
34
+ # config: { ocr: { backend: "custom-ocr", language: "eng" } }
35
+ # )
36
+ #
37
+ # @example Implementing an OCR backend with initialization
38
+ # class ModelBasedOcr
39
+ # include Kreuzberg::OcrBackendProtocol
40
+ #
41
+ # def initialize
42
+ # @model = nil
43
+ # end
44
+ #
45
+ # def name
46
+ # "model-ocr"
47
+ # end
48
+ #
49
+ # def process_image(image_bytes, config)
50
+ # # Load model on first use (lazy initialization)
51
+ # @model ||= load_model
52
+ #
53
+ # # Run OCR
54
+ # @model.recognize(image_bytes, config)
55
+ # end
56
+ #
57
+ # private
58
+ #
59
+ # def load_model
60
+ # # Load ML model for OCR
61
+ # MyOcrModel.load("path/to/model")
62
+ # end
63
+ # end
64
+ #
65
+ # Kreuzberg.register_ocr_backend("model-ocr", ModelBasedOcr.new)
66
+ #
67
module OcrBackendProtocol
  # Unique identifier of this OCR backend.
  #
  # The extraction configuration refers to a backend by this name:
  #
  #   config = { ocr: { backend: "custom-ocr", language: "eng" } }
  #
  # By convention the name is lowercase with hyphens (e.g., "custom-ocr", "tesseract").
  # Including classes must override this method; the default always raises.
  #
  # @return [String] Unique backend identifier
  # @raise [NotImplementedError] when the including class does not override it
  #
  # @example
  #   def name
  #     "custom-ocr"
  #   end
  def name
    implementer = self.class
    raise NotImplementedError, "#{implementer} must implement #name"
  end

  # Run OCR over raw image bytes and return the recognized text.
  #
  # Implementations receive the image data (PNG, JPEG, TIFF, etc.) together
  # with an OCR configuration hash and must return the extracted text as a
  # string. Including classes must override this method; the default always
  # raises.
  #
  # @param image_bytes [String] Binary image data (PNG, JPEG, TIFF, etc.)
  # @param config [Hash] OCR configuration with the following keys:
  #   - "language" [String] - Language code for OCR (e.g., "eng", "deu", "fra")
  #   - "backend" [String] - Backend name (same as #name)
  #   - Additional backend-specific settings
  #
  # @return [String] Extracted text content
  # @raise [NotImplementedError] when the including class does not override it
  #
  # @example
  #   def process_image(image_bytes, config)
  #     language = config["language"] || "eng"
  #     text = my_ocr_engine.recognize(image_bytes, language: language)
  #     text
  #   end
  def process_image(image_bytes, config)
    implementer = self.class
    raise NotImplementedError, "#{implementer} must implement #process_image(image_bytes, config)"
  end
end
113
+ end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Protocol interface for post-processors.
  #
  # Every Ruby post-processor that is registered with the Rust core through the FFI
  # bridge is expected to implement the protocol defined by this module.
  #
  # A post-processor runs once extraction has completed and enriches the extraction
  # result: it may add metadata, transform the content, or perform further analysis.
  #
  # @example Implementing a simple post-processor
  #   class UpcaseProcessor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       result["content"] = result["content"].upcase
  #       result
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("upcase", UpcaseProcessor.new)
  #
  # @example Implementing a post-processor that adds metadata
  #   class EntityExtractor
  #     include Kreuzberg::PostProcessorProtocol
  #
  #     def call(result)
  #       entities = extract_entities(result["content"])
  #       result["metadata"]["entities"] = entities
  #       result
  #     end
  #
  #     private
  #
  #     def extract_entities(text)
  #       # Extract named entities from text
  #       # This is a placeholder - use a real NER library in production
  #       text.scan(/[A-Z][a-z]+(?:\s[A-Z][a-z]+)*/)
  #     end
  #   end
  #
  #   Kreuzberg.register_post_processor("entities", EntityExtractor.new)
  #
  # @example Using a Proc as a post-processor
  #   Kreuzberg.register_post_processor("word_count", ->(result) {
  #     word_count = result["content"].split.length
  #     result["metadata"]["word_count"] = word_count
  #     result
  #   })
  #
  module PostProcessorProtocol
    # Enrich an extraction result after extraction has finished.
    #
    # The extraction result arrives as a hash, and the (possibly modified) hash must be
    # returned. A processor is free to:
    # - Add new keys to result["metadata"]
    # - Transform result["content"]
    # - Add entries to result["tables"]
    # - Modify any other result fields
    #
    # The FFI bridge does not overwrite metadata keys that already exist, so new keys
    # can be added without worrying about conflicts.
    #
    # @param result [Hash] Extraction result with the following structure:
    #   - "content" [String] - Extracted text content
    #   - "mime_type" [String] - MIME type of the source document
    #   - "metadata" [Hash] - Document metadata (title, author, etc.)
    #   - "tables" [Array<Hash>] - Extracted tables
    #   - "detected_languages" [Array<String>, nil] - Detected language codes
    #   - "chunks" [Array<String>, nil] - Content chunks (if chunking enabled)
    #
    # @return [Hash] Modified extraction result with enriched metadata
    # @raise [NotImplementedError] when the including class does not override this method
    #
    # @example
    #   def call(result)
    #     text = result["content"]
    #     entities = extract_entities(text)
    #     result["metadata"]["entities"] = entities
    #     result
    #   end
    def call(result)
      raise NotImplementedError,
            "#{self.class} must implement #call(result)"
    end
  end
end