kreuzberg 4.0.0.rc2 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446)
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +396 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,341 +1,283 @@
1
- # frozen_string_literal: true
2
-
3
- # Async Patterns for Kreuzberg Ruby Bindings
4
- #
5
- # This example demonstrates async patterns and concurrency approaches for Ruby,
6
- # with comparison to the underlying Rust implementation.
7
-
8
- require 'kreuzberg'
9
-
10
- # NOTE: Ruby bindings use Tokio runtime with block_on() internally.
11
- # The "async" functions block the Ruby GVL during execution, so there's
12
- # no performance benefit over the _sync variants from Ruby's perspective.
13
-
14
- # ============================================================================
15
- # Pattern 1: Synchronous Extraction (Recommended)
16
- # ============================================================================
17
-
18
- def basic_sync_extraction
19
- result = Kreuzberg.extract_file_sync('document.pdf')
20
- puts "Content: #{result[:content]}"
21
- puts "MIME type: #{result[:mime_type]}"
22
- end
23
-
24
- # ============================================================================
25
- # Pattern 2: "Async" Extraction (Same Performance as Sync)
26
- # ============================================================================
27
-
28
- def basic_async_extraction
29
- # This LOOKS async but actually blocks the Ruby thread
30
- # Internally uses: runtime.block_on(async { ... })
31
- result = Kreuzberg.extract_file('document.pdf')
32
- puts "Content: #{result[:content]}"
33
- end
34
-
35
- # ============================================================================
36
- # Pattern 3: Concurrent Processing with Ruby Threads
37
- # ============================================================================
38
-
39
- def concurrent_with_threads
40
- files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
41
-
42
- # Use Ruby threads to achieve parallelism
43
- # Each thread calls the synchronous API
44
- threads = files.map do |file|
45
- Thread.new do
46
- Kreuzberg.extract_file_sync(file)
47
- end
48
- end
49
-
50
- results = threads.map(&:value)
51
- results.each_with_index do |result, index|
52
- puts "File #{index + 1}: #{result[:content][0..100]}"
53
- end
54
- end
55
-
56
- # ============================================================================
57
- # Pattern 4: Batch Processing (Preferred for Multiple Files)
58
- # ============================================================================
59
-
60
- def batch_processing
61
- files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
62
-
63
- # The batch API handles concurrency internally via Rust/Tokio
64
- # This is more efficient than Ruby threads
65
- results = Kreuzberg.batch_extract_files_sync(files)
66
-
67
- puts "Processed #{results.length} files"
68
- results.each do |result|
69
- puts "Content preview: #{result[:content][0..50]}"
70
- end
71
- end
72
-
73
- # ============================================================================
74
- # Pattern 5: Extraction with Configuration
75
- # ============================================================================
76
-
77
- def extraction_with_config
78
- # Configure OCR
79
- config = {
80
- ocr: {
81
- backend: 'tesseract',
82
- language: 'eng'
83
- },
84
- force_ocr: false
85
- }
86
-
87
- result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
88
- puts "Extracted with OCR: #{result[:content]}"
89
- end
90
-
91
- # ============================================================================
92
- # Pattern 6: Extract from Bytes
93
- # ============================================================================
94
-
95
- def extract_from_bytes
96
- data = File.binread('document.pdf')
97
- result = Kreuzberg.extract_bytes_sync(data, 'application/pdf')
98
- puts "Extracted from memory: #{result[:content]}"
99
- end
100
-
101
- # ============================================================================
102
- # Pattern 7: Batch Extract from Bytes
103
- # ============================================================================
104
-
105
- def batch_extract_from_bytes
106
- files = ['doc1.pdf', 'doc2.pdf']
107
- bytes_array = files.map { |f| File.binread(f) }
108
- mime_types = ['application/pdf', 'application/pdf']
109
-
110
- results = Kreuzberg.batch_extract_bytes_sync(bytes_array, mime_types)
111
- puts "Processed #{results.length} files from memory"
112
- end
113
-
114
- # ============================================================================
115
- # Pattern 8: Error Handling
116
- # ============================================================================
117
-
118
- def error_handling
119
- Kreuzberg.extract_file_sync('nonexistent.pdf')
120
- rescue StandardError => e
121
- puts "Extraction failed: #{e.message}"
122
- end
123
-
124
- # ============================================================================
125
- # Pattern 9: Sequential Processing
126
- # ============================================================================
127
-
128
- def sequential_processing
129
- files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
130
-
131
- files.each do |file|
132
- result = Kreuzberg.extract_file_sync(file)
133
- puts "Processed #{file}: #{result[:content][0..50]}"
134
- end
135
- end
136
-
137
- # ============================================================================
138
- # Pattern 10: Background Processing with ActiveJob (Rails)
139
- # ============================================================================
140
-
141
- # Example ActiveJob for async processing in Rails
142
- # < ApplicationJob
143
- class DocumentExtractionJob
144
- # queue_as :default
145
-
146
- def perform(file_path)
147
- result = Kreuzberg.extract_file_sync(file_path)
148
- # Store result in database or process further
149
- puts "Background extraction complete: #{result[:content][0..100]}"
150
- end
151
- end
152
-
153
- # Usage in Rails controller:
154
- # DocumentExtractionJob.perform_later('document.pdf')
155
-
156
- # ============================================================================
157
- # Pattern 11: Concurrent Processing with Parallel Gem
158
- # ============================================================================
159
-
160
- def concurrent_with_parallel_gem
161
- require 'parallel'
162
-
163
- files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
164
-
165
- # Process files in parallel using multiple CPU cores
166
- results = Parallel.map(files, in_processes: 4) do |file|
167
- Kreuzberg.extract_file_sync(file)
168
- end
169
-
170
- results.each do |result|
171
- puts "Content: #{result[:content][0..50]}"
172
- end
173
- end
174
-
175
- # ============================================================================
176
- # Pattern 12: Timeout Wrapper
177
- # ============================================================================
178
-
179
- def extraction_with_timeout(file_path, timeout_seconds = 30)
180
- require 'timeout'
181
-
182
- Timeout.timeout(timeout_seconds) do
183
- Kreuzberg.extract_file_sync(file_path)
184
- end
185
- rescue Timeout::Error
186
- puts "Extraction timed out after #{timeout_seconds} seconds"
187
- nil
188
- end
189
-
190
- # ============================================================================
191
- # Pattern 13: Custom Ruby PostProcessor Plugin
192
- # ============================================================================
193
-
194
- def register_postprocessor
195
- # Register a Ruby-based post-processor
196
- uppercase_processor = lambda do |result|
197
- result[:content] = result[:content].upcase
198
- result
199
- end
200
-
201
- Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
202
-
203
- # Now all extractions will use the uppercase processor
204
- result = Kreuzberg.extract_file_sync('document.pdf')
205
- puts "Uppercase content: #{result[:content]}"
206
-
207
- # Clean up
208
- Kreuzberg.unregister_post_processor('uppercase')
209
- end
210
-
211
- # ============================================================================
212
- # Pattern 14: Custom Ruby Validator Plugin
213
- # ============================================================================
214
-
215
- def register_validator
216
- # Register a Ruby-based validator
217
- min_length_validator = lambda do |result|
218
- raise 'Content too short' if result[:content].length < 100
219
- end
220
-
221
- Kreuzberg.register_validator('min_length', min_length_validator, 100)
222
-
223
- # Validation will run automatically during extraction
224
- begin
225
- result = Kreuzberg.extract_file_sync('short_document.pdf')
226
- puts "Validation passed: #{result[:content]}"
227
- rescue StandardError => e
228
- puts "Validation failed: #{e.message}"
229
- end
230
-
231
- # Clean up
232
- Kreuzberg.unregister_validator('min_length')
233
- end
234
-
235
- # ============================================================================
236
- # Pattern 15: Custom Ruby OCR Backend Plugin
237
- # ============================================================================
238
-
239
- # Example OCR backend implementation for custom processing.
240
- class CustomOcrBackend
241
- def process_image(image_bytes, language)
242
- # In a real implementation, you would:
243
- # 1. Call an external OCR service
244
- # 2. Use an HTTP API
245
- # 3. Process with a Ruby gem
246
- "Extracted text from #{image_bytes.length} bytes using #{language}"
247
- end
248
-
249
- def supports_language?(lang)
250
- %w[eng deu fra].include?(lang)
251
- end
252
- end
253
-
254
- def register_ocr_backend
255
- backend = CustomOcrBackend.new
256
- Kreuzberg.register_ocr_backend('custom', backend)
257
-
258
- # Now you can use the custom backend
259
- config = {
260
- ocr: {
261
- backend: 'custom',
262
- language: 'eng'
263
- },
264
- force_ocr: true
265
- }
266
-
267
- result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
268
- puts "Custom OCR result: #{result[:content]}"
269
- end
270
-
271
- # ============================================================================
272
- # Main Demonstration
273
- # ============================================================================
274
-
275
- def main
276
- puts '=== Basic Sync Extraction ==='
277
- basic_sync_extraction
278
-
279
- puts '\n=== Basic Async Extraction (Blocks GVL) ==='
280
- basic_async_extraction
281
-
282
- puts '\n=== Concurrent with Ruby Threads ==='
283
- concurrent_with_threads
284
-
285
- puts '\n=== Batch Processing (Preferred) ==='
286
- batch_processing
287
-
288
- puts '\n=== Extraction with Config ==='
289
- extraction_with_config
290
-
291
- puts '\n=== Extract from Bytes ==='
292
- extract_from_bytes
293
-
294
- puts '\n=== Error Handling ==='
295
- error_handling
296
-
297
- puts '\n=== Sequential Processing ==='
298
- sequential_processing
299
-
300
- puts '\n=== Extraction with Timeout ==='
301
- extraction_with_timeout('large_document.pdf', 30)
302
-
303
- puts '\n=== Custom PostProcessor ==='
304
- register_postprocessor
305
-
306
- puts '\n=== Custom Validator ==='
307
- register_validator
308
- end
309
-
310
- # Run if executed directly
311
- main if __FILE__ == $PROGRAM_NAME
312
-
313
- # ============================================================================
314
- # Key Takeaways:
315
- #
316
- # 1. Ruby bindings use Tokio runtime with block_on() internally
317
- # 2. "Async" functions block the Ruby GVL - no concurrency benefit
318
- # 3. Use _sync variants for clarity (same performance)
319
- # 4. Use Ruby threads or Parallel gem for concurrent processing
320
- # 5. Batch API is most efficient for multiple files
321
- # 6. ActiveJob for background processing in Rails
322
- # 7. Ruby plugins (PostProcessor, Validator, OCR) are fully supported
323
- #
324
- # Performance Comparison:
325
- # - Magnus: Blocks GVL, same overhead as sync (~Xms per call)
326
- # - PyO3 (optimized): ~0.17ms overhead, GIL released during await
327
- # - NAPI-RS: ~0ms overhead, automatic Promise conversion
328
- #
329
- # When to Use Ruby Bindings:
330
- # ✅ Rails applications (ActiveJob for background processing)
331
- # ✅ Ruby scripts (existing Ruby codebases)
332
- # ✅ Simple extraction (single-file processing)
333
- # ✅ Batch processing (batch API handles concurrency)
334
- #
335
- # Consider Other Bindings For:
336
- # ❌ High concurrency (use Node.js/NAPI-RS instead)
337
- # ❌ Real-time processing (use Node.js/NAPI-RS instead)
338
- # ❌ I/O-bound workloads (use Python/PyO3 or Node.js/NAPI-RS)
339
- #
340
- # See packages/ruby/ext/kreuzberg_rb/native/README.md for detailed async runtime documentation.
341
- # ============================================================================
1
+ # frozen_string_literal: true
2
+
3
+ require 'kreuzberg'
4
+
5
+ # NOTE: Ruby bindings use Tokio runtime with block_on() internally.
6
+ # The "async" functions block the Ruby GVL during execution, so there's
7
+ # no performance benefit over the _sync variants from Ruby's perspective.
8
+
9
+ # ============================================================================
10
+ # Pattern 1: Synchronous Extraction (Recommended)
11
+ # ============================================================================
12
+
13
+ def basic_sync_extraction
14
+ result = Kreuzberg.extract_file_sync('document.pdf')
15
+ puts "Content: #{result[:content]}"
16
+ puts "MIME type: #{result[:mime_type]}"
17
+ end
18
+
19
+ # ============================================================================
20
+ # Pattern 2: "Async" Extraction (Same Performance as Sync)
21
+ # ============================================================================
22
+
23
+ def basic_async_extraction
24
+ result = Kreuzberg.extract_file('document.pdf')
25
+ puts "Content: #{result[:content]}"
26
+ end
27
+
28
+ # ============================================================================
29
+ # Pattern 3: Concurrent Processing with Ruby Threads
30
+ # ============================================================================
31
+
32
+ def concurrent_with_threads
33
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
34
+
35
+ threads = files.map do |file|
36
+ Thread.new do
37
+ Kreuzberg.extract_file_sync(file)
38
+ end
39
+ end
40
+
41
+ results = threads.map(&:value)
42
+ results.each_with_index do |result, index|
43
+ puts "File #{index + 1}: #{result[:content][0..100]}"
44
+ end
45
+ end
46
+
47
+ # ============================================================================
48
+ # Pattern 4: Batch Processing (Preferred for Multiple Files)
49
+ # ============================================================================
50
+
51
+ def batch_processing
52
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
53
+
54
+ results = Kreuzberg.batch_extract_files_sync(files)
55
+
56
+ puts "Processed #{results.length} files"
57
+ results.each do |result|
58
+ puts "Content preview: #{result[:content][0..50]}"
59
+ end
60
+ end
61
+
62
+ # ============================================================================
63
+ # Pattern 5: Extraction with Configuration
64
+ # ============================================================================
65
+
66
+ def extraction_with_config
67
+ config = {
68
+ ocr: {
69
+ backend: 'tesseract',
70
+ language: 'eng'
71
+ },
72
+ force_ocr: false
73
+ }
74
+
75
+ result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
76
+ puts "Extracted with OCR: #{result[:content]}"
77
+ end
78
+
79
+ # ============================================================================
80
+ # Pattern 6: Extract from Bytes
81
+ # ============================================================================
82
+
83
+ def extract_from_bytes
84
+ data = File.binread('document.pdf')
85
+ result = Kreuzberg.extract_bytes_sync(data, 'application/pdf')
86
+ puts "Extracted from memory: #{result[:content]}"
87
+ end
88
+
89
+ # ============================================================================
90
+ # Pattern 7: Batch Extract from Bytes
91
+ # ============================================================================
92
+
93
+ def batch_extract_from_bytes
94
+ files = ['doc1.pdf', 'doc2.pdf']
95
+ bytes_array = files.map { |f| File.binread(f) }
96
+ mime_types = ['application/pdf', 'application/pdf']
97
+
98
+ results = Kreuzberg.batch_extract_bytes_sync(bytes_array, mime_types)
99
+ puts "Processed #{results.length} files from memory"
100
+ end
101
+
102
+ # ============================================================================
103
+ # Pattern 8: Error Handling
104
+ # ============================================================================
105
+
106
+ def error_handling
107
+ Kreuzberg.extract_file_sync('nonexistent.pdf')
108
+ rescue StandardError => e
109
+ puts "Extraction failed: #{e.message}"
110
+ end
111
+
112
+ # ============================================================================
113
+ # Pattern 9: Sequential Processing
114
+ # ============================================================================
115
+
116
+ def sequential_processing
117
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
118
+
119
+ files.each do |file|
120
+ result = Kreuzberg.extract_file_sync(file)
121
+ puts "Processed #{file}: #{result[:content][0..50]}"
122
+ end
123
+ end
124
+
125
+ # ============================================================================
126
+ # Pattern 10: Background Processing with ActiveJob (Rails)
127
+ # ============================================================================
128
+
129
+ # Example ActiveJob for async processing in Rails
130
+ # < ApplicationJob
131
+ class DocumentExtractionJob
132
+ def perform(file_path)
133
+ result = Kreuzberg.extract_file_sync(file_path)
134
+ puts "Background extraction complete: #{result[:content][0..100]}"
135
+ end
136
+ end
137
+
138
+ # Usage in Rails controller:
139
+ # DocumentExtractionJob.perform_later('document.pdf')
140
+
141
+ # ============================================================================
142
+ # Pattern 11: Concurrent Processing with Parallel Gem
143
+ # ============================================================================
144
+
145
+ def concurrent_with_parallel_gem
146
+ require 'parallel'
147
+
148
+ files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
149
+
150
+ results = Parallel.map(files, in_processes: 4) do |file|
151
+ Kreuzberg.extract_file_sync(file)
152
+ end
153
+
154
+ results.each do |result|
155
+ puts "Content: #{result[:content][0..50]}"
156
+ end
157
+ end
158
+
159
+ # ============================================================================
160
+ # Pattern 12: Timeout Wrapper
161
+ # ============================================================================
162
+
163
+ def extraction_with_timeout(file_path, timeout_seconds = 30)
164
+ require 'timeout'
165
+
166
+ Timeout.timeout(timeout_seconds) do
167
+ Kreuzberg.extract_file_sync(file_path)
168
+ end
169
+ rescue Timeout::Error
170
+ puts "Extraction timed out after #{timeout_seconds} seconds"
171
+ nil
172
+ end
173
+
174
+ # ============================================================================
175
+ # Pattern 13: Custom Ruby PostProcessor Plugin
176
+ # ============================================================================
177
+
178
+ def register_postprocessor
179
+ uppercase_processor = lambda do |result|
180
+ result[:content] = result[:content].upcase
181
+ result
182
+ end
183
+
184
+ Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
185
+
186
+ result = Kreuzberg.extract_file_sync('document.pdf')
187
+ puts "Uppercase content: #{result[:content]}"
188
+
189
+ Kreuzberg.unregister_post_processor('uppercase')
190
+ end
191
+
192
+ # ============================================================================
193
+ # Pattern 14: Custom Ruby Validator Plugin
194
+ # ============================================================================
195
+
196
+ def register_validator
197
+ min_length_validator = lambda do |result|
198
+ raise 'Content too short' if result[:content].length < 100
199
+ end
200
+
201
+ Kreuzberg.register_validator('min_length', min_length_validator, 100)
202
+
203
+ begin
204
+ result = Kreuzberg.extract_file_sync('short_document.pdf')
205
+ puts "Validation passed: #{result[:content]}"
206
+ rescue StandardError => e
207
+ puts "Validation failed: #{e.message}"
208
+ end
209
+
210
+ Kreuzberg.unregister_validator('min_length')
211
+ end
212
+
213
+ # ============================================================================
214
+ # Pattern 15: Custom Ruby OCR Backend Plugin
215
+ # ============================================================================
216
+
217
+ # Example OCR backend implementation for custom processing.
218
+ class CustomOcrBackend
219
+ def process_image(image_bytes, language)
220
+ "Extracted text from #{image_bytes.length} bytes using #{language}"
221
+ end
222
+
223
+ def supports_language?(lang)
224
+ %w[eng deu fra].include?(lang)
225
+ end
226
+ end
227
+
228
+ def register_ocr_backend
229
+ backend = CustomOcrBackend.new
230
+ Kreuzberg.register_ocr_backend('custom', backend)
231
+
232
+ config = {
233
+ ocr: {
234
+ backend: 'custom',
235
+ language: 'eng'
236
+ },
237
+ force_ocr: true
238
+ }
239
+
240
+ result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
241
+ puts "Custom OCR result: #{result[:content]}"
242
+ end
243
+
244
+ # ============================================================================
245
+ # Main Demonstration
246
+ # ============================================================================
247
+
248
+ def main
249
+ puts '=== Basic Sync Extraction ==='
250
+ basic_sync_extraction
251
+
252
+ puts '\n=== Basic Async Extraction (Blocks GVL) ==='
253
+ basic_async_extraction
254
+
255
+ puts '\n=== Concurrent with Ruby Threads ==='
256
+ concurrent_with_threads
257
+
258
+ puts '\n=== Batch Processing (Preferred) ==='
259
+ batch_processing
260
+
261
+ puts '\n=== Extraction with Config ==='
262
+ extraction_with_config
263
+
264
+ puts '\n=== Extract from Bytes ==='
265
+ extract_from_bytes
266
+
267
+ puts '\n=== Error Handling ==='
268
+ error_handling
269
+
270
+ puts '\n=== Sequential Processing ==='
271
+ sequential_processing
272
+
273
+ puts '\n=== Extraction with Timeout ==='
274
+ extraction_with_timeout('large_document.pdf', 30)
275
+
276
+ puts '\n=== Custom PostProcessor ==='
277
+ register_postprocessor
278
+
279
+ puts '\n=== Custom Validator ==='
280
+ register_validator
281
+ end
282
+
283
+ main if __FILE__ == $PROGRAM_NAME