kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,170 @@
1
+ //! API request and response types.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+ use std::sync::Arc;
5
+
6
+ use crate::{ExtractionConfig, types::ExtractionResult};
7
+
8
+ /// API server size limit configuration.
9
+ ///
10
+ /// Controls maximum sizes for request bodies and multipart uploads.
11
+ /// Default limits are designed for typical document processing workloads.
12
+ ///
13
+ /// # Default Values
14
+ ///
15
+ /// - `max_request_body_bytes`: 100 MB (104,857,600 bytes)
16
+ /// - `max_multipart_field_bytes`: 100 MB (104,857,600 bytes)
17
+ ///
18
+ /// # Examples
19
+ ///
20
+ /// ```
21
+ /// use kreuzberg::api::ApiSizeLimits;
22
+ ///
23
+ /// // Default limits (100 MB)
24
+ /// let limits = ApiSizeLimits::default();
25
+ ///
26
+ /// // Custom limits (50 MB for both)
27
+ /// let limits = ApiSizeLimits {
28
+ /// max_request_body_bytes: 50 * 1024 * 1024,
29
+ /// max_multipart_field_bytes: 50 * 1024 * 1024,
30
+ /// };
31
+ ///
32
+ /// // Very large documents (500 MB)
33
+ /// let limits = ApiSizeLimits {
34
+ /// max_request_body_bytes: 500 * 1024 * 1024,
35
+ /// max_multipart_field_bytes: 500 * 1024 * 1024,
36
+ /// };
37
+ /// ```
38
+ #[derive(Debug, Clone, Copy)]
39
+ pub struct ApiSizeLimits {
40
+ /// Maximum size of the entire request body in bytes.
41
+ ///
42
+ /// This applies to the total size of all uploaded files and form data
43
+ /// in a single request. Default: 100 MB (104,857,600 bytes).
44
+ pub max_request_body_bytes: usize,
45
+
46
+ /// Maximum size of a single multipart field in bytes.
47
+ ///
48
+ /// This applies to individual files in a multipart upload.
49
+ /// Default: 100 MB (104,857,600 bytes).
50
+ pub max_multipart_field_bytes: usize,
51
+ }
52
+
53
+ impl Default for ApiSizeLimits {
54
+ fn default() -> Self {
55
+ Self {
56
+ max_request_body_bytes: 100 * 1024 * 1024,
57
+ max_multipart_field_bytes: 100 * 1024 * 1024,
58
+ }
59
+ }
60
+ }
61
+
62
+ impl ApiSizeLimits {
63
+ /// Create new size limits with custom values.
64
+ ///
65
+ /// # Arguments
66
+ ///
67
+ /// * `max_request_body_bytes` - Maximum total request size in bytes
68
+ /// * `max_multipart_field_bytes` - Maximum individual file size in bytes
69
+ pub fn new(max_request_body_bytes: usize, max_multipart_field_bytes: usize) -> Self {
70
+ Self {
71
+ max_request_body_bytes,
72
+ max_multipart_field_bytes,
73
+ }
74
+ }
75
+
76
+ /// Create size limits from MB values (convenience method).
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `max_request_body_mb` - Maximum total request size in megabytes
81
+ /// * `max_multipart_field_mb` - Maximum individual file size in megabytes
82
+ ///
83
+ /// # Examples
84
+ ///
85
+ /// ```
86
+ /// use kreuzberg::api::ApiSizeLimits;
87
+ ///
88
+ /// // 50 MB limits
89
+ /// let limits = ApiSizeLimits::from_mb(50, 50);
90
+ /// ```
91
+ pub fn from_mb(max_request_body_mb: usize, max_multipart_field_mb: usize) -> Self {
92
+ Self {
93
+ max_request_body_bytes: max_request_body_mb * 1024 * 1024,
94
+ max_multipart_field_bytes: max_multipart_field_mb * 1024 * 1024,
95
+ }
96
+ }
97
+ }
98
+
99
+ /// Health check response.
100
+ #[derive(Debug, Clone, Serialize, Deserialize)]
101
+ pub struct HealthResponse {
102
+ /// Health status
103
+ pub status: String,
104
+ /// API version
105
+ pub version: String,
106
+ }
107
+
108
+ /// Server information response.
109
+ #[derive(Debug, Clone, Serialize, Deserialize)]
110
+ pub struct InfoResponse {
111
+ /// API version
112
+ pub version: String,
113
+ /// Whether using Rust backend
114
+ pub rust_backend: bool,
115
+ }
116
+
117
+ /// Extraction response (list of results).
118
+ pub type ExtractResponse = Vec<ExtractionResult>;
119
+
120
+ /// Error response.
121
+ #[derive(Debug, Clone, Serialize, Deserialize)]
122
+ pub struct ErrorResponse {
123
+ /// Error type name
124
+ pub error_type: String,
125
+ /// Error message
126
+ pub message: String,
127
+ /// Stack trace (if available)
128
+ #[serde(skip_serializing_if = "Option::is_none")]
129
+ pub traceback: Option<String>,
130
+ /// HTTP status code
131
+ pub status_code: u16,
132
+ }
133
+
134
+ /// API server state.
135
+ ///
136
+ /// Holds the default extraction configuration loaded from config file
137
+ /// (via discovery or explicit path). Per-request configs override these defaults.
138
+ #[derive(Debug, Clone)]
139
+ pub struct ApiState {
140
+ /// Default extraction configuration
141
+ pub default_config: Arc<ExtractionConfig>,
142
+ }
143
+
144
+ /// Cache statistics response.
145
+ #[derive(Debug, Clone, Serialize, Deserialize)]
146
+ pub struct CacheStatsResponse {
147
+ /// Cache directory path
148
+ pub directory: String,
149
+ /// Total number of cache files
150
+ pub total_files: usize,
151
+ /// Total cache size in MB
152
+ pub total_size_mb: f64,
153
+ /// Available disk space in MB
154
+ pub available_space_mb: f64,
155
+ /// Age of oldest file in days
156
+ pub oldest_file_age_days: f64,
157
+ /// Age of newest file in days
158
+ pub newest_file_age_days: f64,
159
+ }
160
+
161
+ /// Cache clear response.
162
+ #[derive(Debug, Clone, Serialize, Deserialize)]
163
+ pub struct CacheClearResponse {
164
+ /// Cache directory path
165
+ pub directory: String,
166
+ /// Number of files removed
167
+ pub removed_files: usize,
168
+ /// Space freed in MB
169
+ pub freed_mb: f64,
170
+ }