kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
data/examples/async_patterns.rb
CHANGED
|
@@ -1,341 +1,341 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
# Async Patterns for Kreuzberg Ruby Bindings
|
|
4
|
-
#
|
|
5
|
-
# This example demonstrates async patterns and concurrency approaches for Ruby,
|
|
6
|
-
# with comparison to the underlying Rust implementation.
|
|
7
|
-
|
|
8
|
-
require 'kreuzberg'
|
|
9
|
-
|
|
10
|
-
# NOTE: Ruby bindings use Tokio runtime with block_on() internally.
|
|
11
|
-
# The "async" functions block the Ruby GVL during execution, so there's
|
|
12
|
-
# no performance benefit over the _sync variants from Ruby's perspective.
|
|
13
|
-
|
|
14
|
-
# ============================================================================
|
|
15
|
-
# Pattern 1: Synchronous Extraction (Recommended)
|
|
16
|
-
# ============================================================================
|
|
17
|
-
|
|
18
|
-
def basic_sync_extraction
|
|
19
|
-
result = Kreuzberg.extract_file_sync('document.pdf')
|
|
20
|
-
puts "Content: #{result[:content]}"
|
|
21
|
-
puts "MIME type: #{result[:mime_type]}"
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# ============================================================================
|
|
25
|
-
# Pattern 2: "Async" Extraction (Same Performance as Sync)
|
|
26
|
-
# ============================================================================
|
|
27
|
-
|
|
28
|
-
def basic_async_extraction
|
|
29
|
-
# This LOOKS async but actually blocks the Ruby thread
|
|
30
|
-
# Internally uses: runtime.block_on(async { ... })
|
|
31
|
-
result = Kreuzberg.extract_file('document.pdf')
|
|
32
|
-
puts "Content: #{result[:content]}"
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
# ============================================================================
|
|
36
|
-
# Pattern 3: Concurrent Processing with Ruby Threads
|
|
37
|
-
# ============================================================================
|
|
38
|
-
|
|
39
|
-
def concurrent_with_threads
|
|
40
|
-
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
|
|
41
|
-
|
|
42
|
-
# Use Ruby threads to achieve parallelism
|
|
43
|
-
# Each thread calls the synchronous API
|
|
44
|
-
threads = files.map do |file|
|
|
45
|
-
Thread.new do
|
|
46
|
-
Kreuzberg.extract_file_sync(file)
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
results = threads.map(&:value)
|
|
51
|
-
results.each_with_index do |result, index|
|
|
52
|
-
puts "File #{index + 1}: #{result[:content][0..100]}"
|
|
53
|
-
end
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
# ============================================================================
|
|
57
|
-
# Pattern 4: Batch Processing (Preferred for Multiple Files)
|
|
58
|
-
# ============================================================================
|
|
59
|
-
|
|
60
|
-
def batch_processing
|
|
61
|
-
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
|
|
62
|
-
|
|
63
|
-
# The batch API handles concurrency internally via Rust/Tokio
|
|
64
|
-
# This is more efficient than Ruby threads
|
|
65
|
-
results = Kreuzberg.batch_extract_files_sync(files)
|
|
66
|
-
|
|
67
|
-
puts "Processed #{results.length} files"
|
|
68
|
-
results.each do |result|
|
|
69
|
-
puts "Content preview: #{result[:content][0..50]}"
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
# ============================================================================
|
|
74
|
-
# Pattern 5: Extraction with Configuration
|
|
75
|
-
# ============================================================================
|
|
76
|
-
|
|
77
|
-
def extraction_with_config
|
|
78
|
-
# Configure OCR
|
|
79
|
-
config = {
|
|
80
|
-
ocr: {
|
|
81
|
-
backend: 'tesseract',
|
|
82
|
-
language: 'eng'
|
|
83
|
-
},
|
|
84
|
-
force_ocr: false
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
result = Kreuzberg.extract_file_sync('scanned.pdf', **config)
|
|
88
|
-
puts "Extracted with OCR: #{result[:content]}"
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
# ============================================================================
|
|
92
|
-
# Pattern 6: Extract from Bytes
|
|
93
|
-
# ============================================================================
|
|
94
|
-
|
|
95
|
-
def extract_from_bytes
|
|
96
|
-
data = File.binread('document.pdf')
|
|
97
|
-
result = Kreuzberg.extract_bytes_sync(data, 'application/pdf')
|
|
98
|
-
puts "Extracted from memory: #{result[:content]}"
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
# ============================================================================
|
|
102
|
-
# Pattern 7: Batch Extract from Bytes
|
|
103
|
-
# ============================================================================
|
|
104
|
-
|
|
105
|
-
def batch_extract_from_bytes
|
|
106
|
-
files = ['doc1.pdf', 'doc2.pdf']
|
|
107
|
-
bytes_array = files.map { |f| File.binread(f) }
|
|
108
|
-
mime_types = ['application/pdf', 'application/pdf']
|
|
109
|
-
|
|
110
|
-
results = Kreuzberg.batch_extract_bytes_sync(bytes_array, mime_types)
|
|
111
|
-
puts "Processed #{results.length} files from memory"
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# ============================================================================
|
|
115
|
-
# Pattern 8: Error Handling
|
|
116
|
-
# ============================================================================
|
|
117
|
-
|
|
118
|
-
def error_handling
|
|
119
|
-
Kreuzberg.extract_file_sync('nonexistent.pdf')
|
|
120
|
-
rescue StandardError => e
|
|
121
|
-
puts "Extraction failed: #{e.message}"
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
# ============================================================================
|
|
125
|
-
# Pattern 9: Sequential Processing
|
|
126
|
-
# ============================================================================
|
|
127
|
-
|
|
128
|
-
def sequential_processing
|
|
129
|
-
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf']
|
|
130
|
-
|
|
131
|
-
files.each do |file|
|
|
132
|
-
result = Kreuzberg.extract_file_sync(file)
|
|
133
|
-
puts "Processed #{file}: #{result[:content][0..50]}"
|
|
134
|
-
end
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# ============================================================================
|
|
138
|
-
# Pattern 10: Background Processing with ActiveJob (Rails)
|
|
139
|
-
# ============================================================================
|
|
140
|
-
|
|
141
|
-
# Example ActiveJob for async processing in Rails
|
|
142
|
-
# < ApplicationJob
|
|
143
|
-
class DocumentExtractionJob
|
|
144
|
-
# queue_as :default
|
|
145
|
-
|
|
146
|
-
def perform(file_path)
|
|
147
|
-
result = Kreuzberg.extract_file_sync(file_path)
|
|
148
|
-
# Store result in database or process further
|
|
149
|
-
puts "Background extraction complete: #{result[:content][0..100]}"
|
|
150
|
-
end
|
|
151
|
-
end
|
|
152
|
-
|
|
153
|
-
# Usage in Rails controller:
|
|
154
|
-
# DocumentExtractionJob.perform_later('document.pdf')
|
|
155
|
-
|
|
156
|
-
# ============================================================================
|
|
157
|
-
# Pattern 11: Concurrent Processing with Parallel Gem
|
|
158
|
-
# ============================================================================
|
|
159
|
-
|
|
160
|
-
def concurrent_with_parallel_gem
|
|
161
|
-
require 'parallel'
|
|
162
|
-
|
|
163
|
-
files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf']
|
|
164
|
-
|
|
165
|
-
# Process files in parallel using multiple CPU cores
|
|
166
|
-
results = Parallel.map(files, in_processes: 4) do |file|
|
|
167
|
-
Kreuzberg.extract_file_sync(file)
|
|
168
|
-
end
|
|
169
|
-
|
|
170
|
-
results.each do |result|
|
|
171
|
-
puts "Content: #{result[:content][0..50]}"
|
|
172
|
-
end
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
# ============================================================================
|
|
176
|
-
# Pattern 12: Timeout Wrapper
|
|
177
|
-
# ============================================================================
|
|
178
|
-
|
|
179
|
-
def extraction_with_timeout(file_path, timeout_seconds = 30)
|
|
180
|
-
require 'timeout'
|
|
181
|
-
|
|
182
|
-
Timeout.timeout(timeout_seconds) do
|
|
183
|
-
Kreuzberg.extract_file_sync(file_path)
|
|
184
|
-
end
|
|
185
|
-
rescue Timeout::Error
|
|
186
|
-
puts "Extraction timed out after #{timeout_seconds} seconds"
|
|
187
|
-
nil
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
# ============================================================================
|
|
191
|
-
# Pattern 13: Custom Ruby PostProcessor Plugin
|
|
192
|
-
# ============================================================================
|
|
193
|
-
|
|
194
|
-
def register_postprocessor
|
|
195
|
-
# Register a Ruby-based post-processor
|
|
196
|
-
uppercase_processor = lambda do |result|
|
|
197
|
-
result[:content] = result[:content].upcase
|
|
198
|
-
result
|
|
199
|
-
end
|
|
200
|
-
|
|
201
|
-
Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)
|
|
202
|
-
|
|
203
|
-
# Now all extractions will use the uppercase processor
|
|
204
|
-
result = Kreuzberg.extract_file_sync('document.pdf')
|
|
205
|
-
puts "Uppercase content: #{result[:content]}"
|
|
206
|
-
|
|
207
|
-
# Clean up
|
|
208
|
-
Kreuzberg.unregister_post_processor('uppercase')
|
|
209
|
-
end
|
|
210
|
-
|
|
211
|
-
# ============================================================================
|
|
212
|
-
# Pattern 14: Custom Ruby Validator Plugin
|
|
213
|
-
# ============================================================================
|
|
214
|
-
|
|
215
|
-
def register_validator
|
|
216
|
-
# Register a Ruby-based validator
|
|
217
|
-
min_length_validator = lambda do |result|
|
|
218
|
-
raise 'Content too short' if result[:content].length < 100
|
|
219
|
-
end
|
|
220
|
-
|
|
221
|
-
Kreuzberg.register_validator('min_length', min_length_validator, 100)
|
|
222
|
-
|
|
223
|
-
# Validation will run automatically during extraction
|
|
224
|
-
begin
|
|
225
|
-
result = Kreuzberg.extract_file_sync('short_document.pdf')
|
|
226
|
-
puts "Validation passed: #{result[:content]}"
|
|
227
|
-
rescue StandardError => e
|
|
228
|
-
puts "Validation failed: #{e.message}"
|
|
229
|
-
end
|
|
230
|
-
|
|
231
|
-
# Clean up
|
|
232
|
-
Kreuzberg.unregister_validator('min_length')
|
|
233
|
-
end
|
|
234
|
-
|
|
235
|
-
# ============================================================================
|
|
236
|
-
# Pattern 15: Custom Ruby OCR Backend Plugin
|
|
237
|
-
# ============================================================================
|
|
238
|
-
|
|
239
|
-
# Example OCR backend implementation for custom processing.
|
|
240
|
-
class CustomOcrBackend
|
|
241
|
-
def process_image(image_bytes, language)
|
|
242
|
-
# In a real implementation, you would:
|
|
243
|
-
# 1. Call an external OCR service
|
|
244
|
-
# 2. Use an HTTP API
|
|
245
|
-
# 3. Process with a Ruby gem
|
|
246
|
-
"Extracted text from #{image_bytes.length} bytes using #{language}"
|
|
247
|
-
end
|
|
248
|
-
|
|
249
|
-
def supports_language?(lang)
|
|
250
|
-
%w[eng deu fra].include?(lang)
|
|
251
|
-
end
|
|
252
|
-
end
|
|
253
|
-
|
|
254
|
-
# Registers a CustomOcrBackend instance under the name 'custom', then extracts
# 'scanned.pdf' with OCR forced on and that backend selected, printing the
# resulting content.
#
# NOTE(review): unlike the post-processor and validator examples, nothing here
# unregisters the backend afterwards - confirm whether that is intentional.
def register_ocr_backend
  Kreuzberg.register_ocr_backend('custom', CustomOcrBackend.new)

  # Select the backend registered above and force OCR for this extraction.
  ocr_settings = { backend: 'custom', language: 'eng' }
  extraction = Kreuzberg.extract_file_sync('scanned.pdf', ocr: ocr_settings, force_ocr: true)
  puts "Custom OCR result: #{extraction[:content]}"
end
|
|
270
|
-
|
|
271
|
-
# ============================================================================
|
|
272
|
-
# Main Demonstration
|
|
273
|
-
# ============================================================================
|
|
274
|
-
|
|
275
|
-
# Runs the demonstration patterns one after another, printing a banner before
# each of them.
#
# Every banner after the first begins with "\n" in a double-quoted literal so
# that a blank line separates the sections. A single-quoted '\n' is not an
# escape sequence and would print a literal backslash followed by "n".
#
# Errors raised by an individual demonstration are not rescued here and stop
# the run.
def main
  puts '=== Basic Sync Extraction ==='
  basic_sync_extraction

  puts "\n=== Basic Async Extraction (Blocks GVL) ==="
  basic_async_extraction

  puts "\n=== Concurrent with Ruby Threads ==="
  concurrent_with_threads

  puts "\n=== Batch Processing (Preferred) ==="
  batch_processing

  puts "\n=== Extraction with Config ==="
  extraction_with_config

  puts "\n=== Extract from Bytes ==="
  extract_from_bytes

  puts "\n=== Error Handling ==="
  error_handling

  puts "\n=== Sequential Processing ==="
  sequential_processing

  puts "\n=== Extraction with Timeout ==="
  extraction_with_timeout('large_document.pdf', 30)

  puts "\n=== Custom PostProcessor ==="
  register_postprocessor

  puts "\n=== Custom Validator ==="
  register_validator
end
|
|
309
|
-
|
|
310
|
-
# Run if executed directly
|
|
311
|
-
# Entry-point guard: call main only when this file is the program being run,
# not when it is loaded from another script.
main if $PROGRAM_NAME == __FILE__
|
|
312
|
-
|
|
313
|
-
# ============================================================================
|
|
314
|
-
# Key Takeaways:
|
|
315
|
-
#
|
|
316
|
-
# 1. Ruby bindings use Tokio runtime with block_on() internally
|
|
317
|
-
# 2. "Async" functions block the Ruby GVL - no concurrency benefit
|
|
318
|
-
# 3. Use _sync variants for clarity (same performance)
|
|
319
|
-
# 4. Use Ruby threads or Parallel gem for concurrent processing
|
|
320
|
-
# 5. Batch API is most efficient for multiple files
|
|
321
|
-
# 6. ActiveJob for background processing in Rails
|
|
322
|
-
# 7. Ruby plugins (PostProcessor, Validator, OCR) are fully supported
|
|
323
|
-
#
|
|
324
|
-
# Performance Comparison:
|
|
325
|
-
# - Magnus: Blocks GVL, same overhead as sync (~Xms per call)
|
|
326
|
-
# - PyO3 (optimized): ~0.17ms overhead, GIL released during await
|
|
327
|
-
# - NAPI-RS: ~0ms overhead, automatic Promise conversion
|
|
328
|
-
#
|
|
329
|
-
# When to Use Ruby Bindings:
|
|
330
|
-
# ✅ Rails applications (ActiveJob for background processing)
|
|
331
|
-
# ✅ Ruby scripts (existing Ruby codebases)
|
|
332
|
-
# ✅ Simple extraction (single-file processing)
|
|
333
|
-
# ✅ Batch processing (batch API handles concurrency)
|
|
334
|
-
#
|
|
335
|
-
# Consider Other Bindings For:
|
|
336
|
-
# ❌ High concurrency (use Node.js/NAPI-RS instead)
|
|
337
|
-
# ❌ Real-time processing (use Node.js/NAPI-RS instead)
|
|
338
|
-
# ❌ I/O-bound workloads (use Python/PyO3 or Node.js/NAPI-RS)
|
|
339
|
-
#
|
|
340
|
-
# See packages/ruby/ext/kreuzberg_rb/native/README.md for detailed async runtime documentation.
|
|
341
|
-
# ============================================================================
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Async Patterns for Kreuzberg Ruby Bindings
|
|
4
|
+
#
|
|
5
|
+
# This example demonstrates async patterns and concurrency approaches for Ruby,
|
|
6
|
+
# with comparison to the underlying Rust implementation.
|
|
7
|
+
|
|
8
|
+
require 'kreuzberg'
|
|
9
|
+
|
|
10
|
+
# NOTE: Ruby bindings use Tokio runtime with block_on() internally.
|
|
11
|
+
# The "async" functions block the Ruby GVL during execution, so there's
|
|
12
|
+
# no performance benefit over the _sync variants from Ruby's perspective.
|
|
13
|
+
|
|
14
|
+
# ============================================================================
|
|
15
|
+
# Pattern 1: Synchronous Extraction (Recommended)
|
|
16
|
+
# ============================================================================
|
|
17
|
+
|
|
18
|
+
# Extracts 'document.pdf' with the synchronous API and prints the :content and
# :mime_type entries of the result, one per line.
def basic_sync_extraction
  extraction = Kreuzberg.extract_file_sync('document.pdf')

  content = extraction[:content]
  puts "Content: #{content}"

  mime_type = extraction[:mime_type]
  puts "MIME type: #{mime_type}"
end
|
|
23
|
+
|
|
24
|
+
# ============================================================================
|
|
25
|
+
# Pattern 2: "Async" Extraction (Same Performance as Sync)
|
|
26
|
+
# ============================================================================
|
|
27
|
+
|
|
28
|
+
# Extracts 'document.pdf' through Kreuzberg.extract_file (the variant without
# the _sync suffix) and prints the :content entry of the result.
def basic_async_extraction
  # Despite looking asynchronous, this call blocks the Ruby thread.
  # Internally it uses: runtime.block_on(async { ... })
  content = Kreuzberg.extract_file('document.pdf')[:content]
  puts "Content: #{content}"
end
|
|
34
|
+
|
|
35
|
+
# ============================================================================
|
|
36
|
+
# Pattern 3: Concurrent Processing with Ruby Threads
|
|
37
|
+
# ============================================================================
|
|
38
|
+
|
|
39
|
+
# Extracts three documents, each in its own Ruby thread, and prints the first
# 101 characters of every result in input order.
def concurrent_with_threads
  # One thread per file; each thread calls the synchronous API.
  workers = %w[doc1.pdf doc2.pdf doc3.pdf].map do |path|
    Thread.new(path) { |document| Kreuzberg.extract_file_sync(document) }
  end

  # Thread#value joins the thread and returns the block's result (re-raising
  # any exception raised inside it), so all work is done before printing.
  extractions = workers.map(&:value)

  extractions.each.with_index(1) do |extraction, position|
    puts "File #{position}: #{extraction[:content][0..100]}"
  end
end
|
|
55
|
+
|
|
56
|
+
# ============================================================================
|
|
57
|
+
# Pattern 4: Batch Processing (Preferred for Multiple Files)
|
|
58
|
+
# ============================================================================
|
|
59
|
+
|
|
60
|
+
# Extracts three documents with a single call to the batch API, then prints how
# many results came back and the first 51 characters of each.
def batch_processing
  paths = %w[doc1.pdf doc2.pdf doc3.pdf]

  # The batch API handles concurrency internally via Rust/Tokio, which is more
  # efficient than spawning Ruby threads.
  extractions = Kreuzberg.batch_extract_files_sync(paths)

  puts "Processed #{extractions.length} files"
  extractions.each { |extraction| puts "Content preview: #{extraction[:content][0..50]}" }
end
|
|
72
|
+
|
|
73
|
+
# ============================================================================
|
|
74
|
+
# Pattern 5: Extraction with Configuration
|
|
75
|
+
# ============================================================================
|
|
76
|
+
|
|
77
|
+
# Extracts 'scanned.pdf' while passing OCR settings as keyword arguments
# (tesseract backend, language 'eng', OCR not forced) and prints the content.
def extraction_with_config
  ocr_settings = { backend: 'tesseract', language: 'eng' }

  extraction = Kreuzberg.extract_file_sync('scanned.pdf', ocr: ocr_settings, force_ocr: false)
  puts "Extracted with OCR: #{extraction[:content]}"
end
|
|
90
|
+
|
|
91
|
+
# ============================================================================
|
|
92
|
+
# Pattern 6: Extract from Bytes
|
|
93
|
+
# ============================================================================
|
|
94
|
+
|
|
95
|
+
# Reads 'document.pdf' from the current directory as binary data, extracts it
# from memory with the MIME type 'application/pdf', and prints the content.
def extract_from_bytes
  extraction = Kreuzberg.extract_bytes_sync(File.binread('document.pdf'), 'application/pdf')
  puts "Extracted from memory: #{extraction[:content]}"
end
|
|
100
|
+
|
|
101
|
+
# ============================================================================
|
|
102
|
+
# Pattern 7: Batch Extract from Bytes
|
|
103
|
+
# ============================================================================
|
|
104
|
+
|
|
105
|
+
# Reads two PDF files into memory and extracts both with one batch call,
# supplying one 'application/pdf' MIME type per payload, then prints how many
# results were returned.
def batch_extract_from_bytes
  paths = %w[doc1.pdf doc2.pdf]
  payloads = paths.map { |path| File.binread(path) }
  # One MIME type entry per payload, in the same order.
  mime_types = Array.new(paths.length) { 'application/pdf' }

  extractions = Kreuzberg.batch_extract_bytes_sync(payloads, mime_types)
  puts "Processed #{extractions.length} files from memory"
end
|
|
113
|
+
|
|
114
|
+
# ============================================================================
|
|
115
|
+
# Pattern 8: Error Handling
|
|
116
|
+
# ============================================================================
|
|
117
|
+
|
|
118
|
+
# Attempts to extract 'nonexistent.pdf' and, if a StandardError is raised,
# prints its message instead of letting it propagate.
#
# Returns whatever the extraction returns when it succeeds, or nil (the value
# of puts) when an error was reported.
def error_handling
  begin
    Kreuzberg.extract_file_sync('nonexistent.pdf')
  rescue StandardError => error
    puts "Extraction failed: #{error.message}"
  end
end
|
|
123
|
+
|
|
124
|
+
# ============================================================================
|
|
125
|
+
# Pattern 9: Sequential Processing
|
|
126
|
+
# ============================================================================
|
|
127
|
+
|
|
128
|
+
# Extracts three documents one after another and prints the first 51
# characters of each result as soon as that file has been processed.
def sequential_processing
  %w[doc1.pdf doc2.pdf doc3.pdf].each do |path|
    preview = Kreuzberg.extract_file_sync(path)[:content][0..50]
    puts "Processed #{path}: #{preview}"
  end
end
|
|
136
|
+
|
|
137
|
+
# ============================================================================
|
|
138
|
+
# Pattern 10: Background Processing with ActiveJob (Rails)
|
|
139
|
+
# ============================================================================
|
|
140
|
+
|
|
141
|
+
# Example ActiveJob for async processing in Rails
|
|
142
|
+
# < ApplicationJob
|
|
143
|
+
# Sketch of a background job that extracts a single file.
#
# In this example it is a plain Ruby class; the commented-out queue_as line
# shows what would be enabled in a Rails job.
class DocumentExtractionJob
  # queue_as :default

  # Extracts the given file synchronously and prints the first 101 characters
  # of its content. A real job would store or further process the result.
  #
  # @param file_path [String] path of the document to extract
  def perform(file_path)
    extraction = Kreuzberg.extract_file_sync(file_path)
    preview = extraction[:content][0..100]
    puts "Background extraction complete: #{preview}"
  end
end
|
|
152
|
+
|
|
153
|
+
# Usage in Rails controller:
|
|
154
|
+
# DocumentExtractionJob.perform_later('document.pdf')
|
|
155
|
+
|
|
156
|
+
# ============================================================================
|
|
157
|
+
# Pattern 11: Concurrent Processing with Parallel Gem
|
|
158
|
+
# ============================================================================
|
|
159
|
+
|
|
160
|
+
# Extracts four documents through Parallel.map with in_processes: 4 and prints
# the first 51 characters of each result.
#
# The 'parallel' gem is required lazily inside the method, so it is only needed
# when this pattern is actually run.
def concurrent_with_parallel_gem
  require 'parallel'

  paths = %w[doc1.pdf doc2.pdf doc3.pdf doc4.pdf]

  # Process files in parallel using multiple CPU cores.
  extractions = Parallel.map(paths, in_processes: 4) { |path| Kreuzberg.extract_file_sync(path) }

  extractions.each { |extraction| puts "Content: #{extraction[:content][0..50]}" }
end
|
|
174
|
+
|
|
175
|
+
# ============================================================================
|
|
176
|
+
# Pattern 12: Timeout Wrapper
|
|
177
|
+
# ============================================================================
|
|
178
|
+
|
|
179
|
+
# Extracts a file inside a Timeout.timeout block.
#
# NOTE(review): this file states that the native calls block the GVL; if so,
# the timeout presumably cannot interrupt a call that is already running -
# confirm before relying on it as a hard limit.
#
# @param file_path [String] path of the document to extract
# @param timeout_seconds [Numeric] limit passed to Timeout.timeout (default 30)
# @return the extraction result, or nil after printing a message when
#   Timeout::Error was raised
def extraction_with_timeout(file_path, timeout_seconds = 30)
  require 'timeout'

  begin
    Timeout.timeout(timeout_seconds) { Kreuzberg.extract_file_sync(file_path) }
  rescue Timeout::Error
    puts "Extraction timed out after #{timeout_seconds} seconds"
    nil
  end
end
|
|
189
|
+
|
|
190
|
+
# ============================================================================
|
|
191
|
+
# Pattern 13: Custom Ruby PostProcessor Plugin
|
|
192
|
+
# ============================================================================
|
|
193
|
+
|
|
194
|
+
# Registers a lambda post-processor under the name 'uppercase', runs one
# extraction of 'document.pdf', prints the processed content, and unregisters
# the processor again.
#
# The unregister call lives in an ensure clause: the registration is made on
# Kreuzberg itself, so without it a failing extraction would leave the
# processor registered for later extractions in the same process.
#
# @return [nil]
# @raise whatever the extraction raises; the error propagates after cleanup
def register_postprocessor
  # Upcases the content in place and hands the same result object back.
  uppercase_processor = lambda do |result|
    result[:content] = result[:content].upcase
    result
  end

  # The third argument (100) is passed through as-is.
  # NOTE(review): presumably a priority - confirm against the Kreuzberg API.
  Kreuzberg.register_post_processor('uppercase', uppercase_processor, 100)

  begin
    # Extractions made while the processor is registered go through it.
    result = Kreuzberg.extract_file_sync('document.pdf')
    puts "Uppercase content: #{result[:content]}"
  ensure
    # Clean up even when the extraction above raised.
    Kreuzberg.unregister_post_processor('uppercase')
  end
end
|
|
210
|
+
|
|
211
|
+
# ============================================================================
|
|
212
|
+
# Pattern 14: Custom Ruby Validator Plugin
|
|
213
|
+
# ============================================================================
|
|
214
|
+
|
|
215
|
+
# Registers a lambda validator under the name 'min_length', runs one
# extraction against 'short_document.pdf', reports the outcome, and then
# unregisters the validator again.
#
# The validator raises 'Content too short' when result[:content] is shorter
# than 100 characters. Any StandardError raised while extracting is caught and
# printed, so the unregister call at the end is still reached.
def register_validator
  min_length_validator = ->(result) { raise 'Content too short' if result[:content].length < 100 }

  # The third argument (100) is passed through as-is.
  # NOTE(review): presumably a priority - confirm against the Kreuzberg API.
  Kreuzberg.register_validator('min_length', min_length_validator, 100)

  begin
    extraction = Kreuzberg.extract_file_sync('short_document.pdf')
    puts "Validation passed: #{extraction[:content]}"
  rescue StandardError => error
    puts "Validation failed: #{error.message}"
  end

  # Clean up so the validator does not stay registered.
  Kreuzberg.unregister_validator('min_length')
end
|
|
234
|
+
|
|
235
|
+
# ============================================================================
|
|
236
|
+
# Pattern 15: Custom Ruby OCR Backend Plugin
|
|
237
|
+
# ============================================================================
|
|
238
|
+
|
|
239
|
+
# Example OCR backend implementation for custom processing.
|
|
240
|
+
# Minimal example of an object that can be handed to
# Kreuzberg.register_ocr_backend. It performs no real OCR: it only reports how
# many bytes it was given and for which language.
class CustomOcrBackend
  # Returns a placeholder string describing the request.
  #
  # A real implementation would, for example:
  #   1. Call an external OCR service
  #   2. Use an HTTP API
  #   3. Process with a Ruby gem
  #
  # @param image_bytes [#length] the image data; only its length is used here
  # @param language [Object] the language code, interpolated into the result
  # @return [String]
  def process_image(image_bytes, language)
    byte_count = image_bytes.length
    "Extracted text from #{byte_count} bytes using #{language}"
  end

  # @param lang [Object] the language code to check
  # @return [Boolean] true only when lang equals 'eng', 'deu' or 'fra'
  def supports_language?(lang)
    %w[eng deu fra].any? { |code| code == lang }
  end
end
|
|
253
|
+
|
|
254
|
+
# Registers a CustomOcrBackend instance under the name 'custom', then extracts
# 'scanned.pdf' with OCR forced on and that backend selected, printing the
# resulting content.
#
# NOTE(review): unlike the post-processor and validator examples, nothing here
# unregisters the backend afterwards - confirm whether that is intentional.
def register_ocr_backend
  Kreuzberg.register_ocr_backend('custom', CustomOcrBackend.new)

  # Select the backend registered above and force OCR for this extraction.
  ocr_settings = { backend: 'custom', language: 'eng' }
  extraction = Kreuzberg.extract_file_sync('scanned.pdf', ocr: ocr_settings, force_ocr: true)
  puts "Custom OCR result: #{extraction[:content]}"
end
|
|
270
|
+
|
|
271
|
+
# ============================================================================
|
|
272
|
+
# Main Demonstration
|
|
273
|
+
# ============================================================================
|
|
274
|
+
|
|
275
|
+
# Runs the demonstration patterns one after another, printing a banner before
# each of them.
#
# Every banner after the first begins with "\n" in a double-quoted literal so
# that a blank line separates the sections. A single-quoted '\n' is not an
# escape sequence and would print a literal backslash followed by "n".
#
# Errors raised by an individual demonstration are not rescued here and stop
# the run.
def main
  puts '=== Basic Sync Extraction ==='
  basic_sync_extraction

  puts "\n=== Basic Async Extraction (Blocks GVL) ==="
  basic_async_extraction

  puts "\n=== Concurrent with Ruby Threads ==="
  concurrent_with_threads

  puts "\n=== Batch Processing (Preferred) ==="
  batch_processing

  puts "\n=== Extraction with Config ==="
  extraction_with_config

  puts "\n=== Extract from Bytes ==="
  extract_from_bytes

  puts "\n=== Error Handling ==="
  error_handling

  puts "\n=== Sequential Processing ==="
  sequential_processing

  puts "\n=== Extraction with Timeout ==="
  extraction_with_timeout('large_document.pdf', 30)

  puts "\n=== Custom PostProcessor ==="
  register_postprocessor

  puts "\n=== Custom Validator ==="
  register_validator
end
|
|
309
|
+
|
|
310
|
+
# Run if executed directly
|
|
311
|
+
# Entry-point guard: call main only when this file is the program being run,
# not when it is loaded from another script.
main if $PROGRAM_NAME == __FILE__
|
|
312
|
+
|
|
313
|
+
# ============================================================================
|
|
314
|
+
# Key Takeaways:
|
|
315
|
+
#
|
|
316
|
+
# 1. Ruby bindings use Tokio runtime with block_on() internally
|
|
317
|
+
# 2. "Async" functions block the Ruby GVL - no concurrency benefit
|
|
318
|
+
# 3. Use _sync variants for clarity (same performance)
|
|
319
|
+
# 4. Use Ruby threads or Parallel gem for concurrent processing
|
|
320
|
+
# 5. Batch API is most efficient for multiple files
|
|
321
|
+
# 6. ActiveJob for background processing in Rails
|
|
322
|
+
# 7. Ruby plugins (PostProcessor, Validator, OCR) are fully supported
|
|
323
|
+
#
|
|
324
|
+
# Performance Comparison:
|
|
325
|
+
# - Magnus: Blocks GVL, same overhead as sync (~Xms per call)
|
|
326
|
+
# - PyO3 (optimized): ~0.17ms overhead, GIL released during await
|
|
327
|
+
# - NAPI-RS: ~0ms overhead, automatic Promise conversion
|
|
328
|
+
#
|
|
329
|
+
# When to Use Ruby Bindings:
|
|
330
|
+
# ✅ Rails applications (ActiveJob for background processing)
|
|
331
|
+
# ✅ Ruby scripts (existing Ruby codebases)
|
|
332
|
+
# ✅ Simple extraction (single-file processing)
|
|
333
|
+
# ✅ Batch processing (batch API handles concurrency)
|
|
334
|
+
#
|
|
335
|
+
# Consider Other Bindings For:
|
|
336
|
+
# ❌ High concurrency (use Node.js/NAPI-RS instead)
|
|
337
|
+
# ❌ Real-time processing (use Node.js/NAPI-RS instead)
|
|
338
|
+
# ❌ I/O-bound workloads (use Python/PyO3 or Node.js/NAPI-RS)
|
|
339
|
+
#
|
|
340
|
+
# See packages/ruby/ext/kreuzberg_rb/native/README.md for detailed async runtime documentation.
|
|
341
|
+
# ============================================================================
|