kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,170 +1,170 @@
|
|
|
1
|
-
//! API request and response types.
|
|
2
|
-
|
|
3
|
-
use serde::{Deserialize, Serialize};
|
|
4
|
-
use std::sync::Arc;
|
|
5
|
-
|
|
6
|
-
use crate::{ExtractionConfig, types::ExtractionResult};
|
|
7
|
-
|
|
8
|
-
/// API server size limit configuration.
|
|
9
|
-
///
|
|
10
|
-
/// Controls maximum sizes for request bodies and multipart uploads.
|
|
11
|
-
/// Default limits are designed for typical document processing workloads.
|
|
12
|
-
///
|
|
13
|
-
/// # Default Values
|
|
14
|
-
///
|
|
15
|
-
/// - `max_request_body_bytes`: 100 MB (104,857,600 bytes)
|
|
16
|
-
/// - `max_multipart_field_bytes`: 100 MB (104,857,600 bytes)
|
|
17
|
-
///
|
|
18
|
-
/// # Examples
|
|
19
|
-
///
|
|
20
|
-
/// ```
|
|
21
|
-
/// use kreuzberg::api::ApiSizeLimits;
|
|
22
|
-
///
|
|
23
|
-
/// // Default limits (100 MB)
|
|
24
|
-
/// let limits = ApiSizeLimits::default();
|
|
25
|
-
///
|
|
26
|
-
/// // Custom limits (50 MB for both)
|
|
27
|
-
/// let limits = ApiSizeLimits {
|
|
28
|
-
/// max_request_body_bytes: 50 * 1024 * 1024,
|
|
29
|
-
/// max_multipart_field_bytes: 50 * 1024 * 1024,
|
|
30
|
-
/// };
|
|
31
|
-
///
|
|
32
|
-
/// // Very large documents (500 MB)
|
|
33
|
-
/// let limits = ApiSizeLimits {
|
|
34
|
-
/// max_request_body_bytes: 500 * 1024 * 1024,
|
|
35
|
-
/// max_multipart_field_bytes: 500 * 1024 * 1024,
|
|
36
|
-
/// };
|
|
37
|
-
/// ```
|
|
38
|
-
#[derive(Debug, Clone, Copy)]
|
|
39
|
-
pub struct ApiSizeLimits {
|
|
40
|
-
/// Maximum size of the entire request body in bytes.
|
|
41
|
-
///
|
|
42
|
-
/// This applies to the total size of all uploaded files and form data
|
|
43
|
-
/// in a single request. Default: 100 MB (104,857,600 bytes).
|
|
44
|
-
pub max_request_body_bytes: usize,
|
|
45
|
-
|
|
46
|
-
/// Maximum size of a single multipart field in bytes.
|
|
47
|
-
///
|
|
48
|
-
/// This applies to individual files in a multipart upload.
|
|
49
|
-
/// Default: 100 MB (104,857,600 bytes).
|
|
50
|
-
pub max_multipart_field_bytes: usize,
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
impl Default for ApiSizeLimits {
|
|
54
|
-
fn default() -> Self {
|
|
55
|
-
Self {
|
|
56
|
-
max_request_body_bytes: 100 * 1024 * 1024,
|
|
57
|
-
max_multipart_field_bytes: 100 * 1024 * 1024,
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
impl ApiSizeLimits {
|
|
63
|
-
/// Create new size limits with custom values.
|
|
64
|
-
///
|
|
65
|
-
/// # Arguments
|
|
66
|
-
///
|
|
67
|
-
/// * `max_request_body_bytes` - Maximum total request size in bytes
|
|
68
|
-
/// * `max_multipart_field_bytes` - Maximum individual file size in bytes
|
|
69
|
-
pub fn new(max_request_body_bytes: usize, max_multipart_field_bytes: usize) -> Self {
|
|
70
|
-
Self {
|
|
71
|
-
max_request_body_bytes,
|
|
72
|
-
max_multipart_field_bytes,
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
/// Create size limits from MB values (convenience method).
|
|
77
|
-
///
|
|
78
|
-
/// # Arguments
|
|
79
|
-
///
|
|
80
|
-
/// * `max_request_body_mb` - Maximum total request size in megabytes
|
|
81
|
-
/// * `max_multipart_field_mb` - Maximum individual file size in megabytes
|
|
82
|
-
///
|
|
83
|
-
/// # Examples
|
|
84
|
-
///
|
|
85
|
-
/// ```
|
|
86
|
-
/// use kreuzberg::api::ApiSizeLimits;
|
|
87
|
-
///
|
|
88
|
-
/// // 50 MB limits
|
|
89
|
-
/// let limits = ApiSizeLimits::from_mb(50, 50);
|
|
90
|
-
/// ```
|
|
91
|
-
pub fn from_mb(max_request_body_mb: usize, max_multipart_field_mb: usize) -> Self {
|
|
92
|
-
Self {
|
|
93
|
-
max_request_body_bytes: max_request_body_mb * 1024 * 1024,
|
|
94
|
-
max_multipart_field_bytes: max_multipart_field_mb * 1024 * 1024,
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
/// Health check response.
|
|
100
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
101
|
-
pub struct HealthResponse {
|
|
102
|
-
/// Health status
|
|
103
|
-
pub status: String,
|
|
104
|
-
/// API version
|
|
105
|
-
pub version: String,
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/// Server information response.
|
|
109
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
110
|
-
pub struct InfoResponse {
|
|
111
|
-
/// API version
|
|
112
|
-
pub version: String,
|
|
113
|
-
/// Whether using Rust backend
|
|
114
|
-
pub rust_backend: bool,
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
/// Extraction response (list of results).
|
|
118
|
-
pub type ExtractResponse = Vec<ExtractionResult>;
|
|
119
|
-
|
|
120
|
-
/// Error response.
|
|
121
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
122
|
-
pub struct ErrorResponse {
|
|
123
|
-
/// Error type name
|
|
124
|
-
pub error_type: String,
|
|
125
|
-
/// Error message
|
|
126
|
-
pub message: String,
|
|
127
|
-
/// Stack trace (if available)
|
|
128
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
129
|
-
pub traceback: Option<String>,
|
|
130
|
-
/// HTTP status code
|
|
131
|
-
pub status_code: u16,
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
/// API server state.
|
|
135
|
-
///
|
|
136
|
-
/// Holds the default extraction configuration loaded from config file
|
|
137
|
-
/// (via discovery or explicit path). Per-request configs override these defaults.
|
|
138
|
-
#[derive(Debug, Clone)]
|
|
139
|
-
pub struct ApiState {
|
|
140
|
-
/// Default extraction configuration
|
|
141
|
-
pub default_config: Arc<ExtractionConfig>,
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
/// Cache statistics response.
|
|
145
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
146
|
-
pub struct CacheStatsResponse {
|
|
147
|
-
/// Cache directory path
|
|
148
|
-
pub directory: String,
|
|
149
|
-
/// Total number of cache files
|
|
150
|
-
pub total_files: usize,
|
|
151
|
-
/// Total cache size in MB
|
|
152
|
-
pub total_size_mb: f64,
|
|
153
|
-
/// Available disk space in MB
|
|
154
|
-
pub available_space_mb: f64,
|
|
155
|
-
/// Age of oldest file in days
|
|
156
|
-
pub oldest_file_age_days: f64,
|
|
157
|
-
/// Age of newest file in days
|
|
158
|
-
pub newest_file_age_days: f64,
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
/// Cache clear response.
|
|
162
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
163
|
-
pub struct CacheClearResponse {
|
|
164
|
-
/// Cache directory path
|
|
165
|
-
pub directory: String,
|
|
166
|
-
/// Number of files removed
|
|
167
|
-
pub removed_files: usize,
|
|
168
|
-
/// Space freed in MB
|
|
169
|
-
pub freed_mb: f64,
|
|
170
|
-
}
|
|
1
|
+
//! API request and response types.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
use std::sync::Arc;
|
|
5
|
+
|
|
6
|
+
use crate::{ExtractionConfig, types::ExtractionResult};
|
|
7
|
+
|
|
8
|
+
/// API server size limit configuration.
|
|
9
|
+
///
|
|
10
|
+
/// Controls maximum sizes for request bodies and multipart uploads.
|
|
11
|
+
/// Default limits are designed for typical document processing workloads.
|
|
12
|
+
///
|
|
13
|
+
/// # Default Values
|
|
14
|
+
///
|
|
15
|
+
/// - `max_request_body_bytes`: 100 MB (104,857,600 bytes)
|
|
16
|
+
/// - `max_multipart_field_bytes`: 100 MB (104,857,600 bytes)
|
|
17
|
+
///
|
|
18
|
+
/// # Examples
|
|
19
|
+
///
|
|
20
|
+
/// ```
|
|
21
|
+
/// use kreuzberg::api::ApiSizeLimits;
|
|
22
|
+
///
|
|
23
|
+
/// // Default limits (100 MB)
|
|
24
|
+
/// let limits = ApiSizeLimits::default();
|
|
25
|
+
///
|
|
26
|
+
/// // Custom limits (50 MB for both)
|
|
27
|
+
/// let limits = ApiSizeLimits {
|
|
28
|
+
/// max_request_body_bytes: 50 * 1024 * 1024,
|
|
29
|
+
/// max_multipart_field_bytes: 50 * 1024 * 1024,
|
|
30
|
+
/// };
|
|
31
|
+
///
|
|
32
|
+
/// // Very large documents (500 MB)
|
|
33
|
+
/// let limits = ApiSizeLimits {
|
|
34
|
+
/// max_request_body_bytes: 500 * 1024 * 1024,
|
|
35
|
+
/// max_multipart_field_bytes: 500 * 1024 * 1024,
|
|
36
|
+
/// };
|
|
37
|
+
/// ```
|
|
38
|
+
#[derive(Debug, Clone, Copy)]
|
|
39
|
+
pub struct ApiSizeLimits {
|
|
40
|
+
/// Maximum size of the entire request body in bytes.
|
|
41
|
+
///
|
|
42
|
+
/// This applies to the total size of all uploaded files and form data
|
|
43
|
+
/// in a single request. Default: 100 MB (104,857,600 bytes).
|
|
44
|
+
pub max_request_body_bytes: usize,
|
|
45
|
+
|
|
46
|
+
/// Maximum size of a single multipart field in bytes.
|
|
47
|
+
///
|
|
48
|
+
/// This applies to individual files in a multipart upload.
|
|
49
|
+
/// Default: 100 MB (104,857,600 bytes).
|
|
50
|
+
pub max_multipart_field_bytes: usize,
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
impl Default for ApiSizeLimits {
|
|
54
|
+
fn default() -> Self {
|
|
55
|
+
Self {
|
|
56
|
+
max_request_body_bytes: 100 * 1024 * 1024,
|
|
57
|
+
max_multipart_field_bytes: 100 * 1024 * 1024,
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
impl ApiSizeLimits {
|
|
63
|
+
/// Create new size limits with custom values.
|
|
64
|
+
///
|
|
65
|
+
/// # Arguments
|
|
66
|
+
///
|
|
67
|
+
/// * `max_request_body_bytes` - Maximum total request size in bytes
|
|
68
|
+
/// * `max_multipart_field_bytes` - Maximum individual file size in bytes
|
|
69
|
+
pub fn new(max_request_body_bytes: usize, max_multipart_field_bytes: usize) -> Self {
|
|
70
|
+
Self {
|
|
71
|
+
max_request_body_bytes,
|
|
72
|
+
max_multipart_field_bytes,
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Create size limits from MB values (convenience method).
|
|
77
|
+
///
|
|
78
|
+
/// # Arguments
|
|
79
|
+
///
|
|
80
|
+
/// * `max_request_body_mb` - Maximum total request size in megabytes
|
|
81
|
+
/// * `max_multipart_field_mb` - Maximum individual file size in megabytes
|
|
82
|
+
///
|
|
83
|
+
/// # Examples
|
|
84
|
+
///
|
|
85
|
+
/// ```
|
|
86
|
+
/// use kreuzberg::api::ApiSizeLimits;
|
|
87
|
+
///
|
|
88
|
+
/// // 50 MB limits
|
|
89
|
+
/// let limits = ApiSizeLimits::from_mb(50, 50);
|
|
90
|
+
/// ```
|
|
91
|
+
pub fn from_mb(max_request_body_mb: usize, max_multipart_field_mb: usize) -> Self {
|
|
92
|
+
Self {
|
|
93
|
+
max_request_body_bytes: max_request_body_mb * 1024 * 1024,
|
|
94
|
+
max_multipart_field_bytes: max_multipart_field_mb * 1024 * 1024,
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/// Health check response.
|
|
100
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
101
|
+
pub struct HealthResponse {
|
|
102
|
+
/// Health status
|
|
103
|
+
pub status: String,
|
|
104
|
+
/// API version
|
|
105
|
+
pub version: String,
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Server information response.
|
|
109
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
110
|
+
pub struct InfoResponse {
|
|
111
|
+
/// API version
|
|
112
|
+
pub version: String,
|
|
113
|
+
/// Whether using Rust backend
|
|
114
|
+
pub rust_backend: bool,
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/// Extraction response (list of results).
|
|
118
|
+
pub type ExtractResponse = Vec<ExtractionResult>;
|
|
119
|
+
|
|
120
|
+
/// Error response.
|
|
121
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
122
|
+
pub struct ErrorResponse {
|
|
123
|
+
/// Error type name
|
|
124
|
+
pub error_type: String,
|
|
125
|
+
/// Error message
|
|
126
|
+
pub message: String,
|
|
127
|
+
/// Stack trace (if available)
|
|
128
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
129
|
+
pub traceback: Option<String>,
|
|
130
|
+
/// HTTP status code
|
|
131
|
+
pub status_code: u16,
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/// API server state.
|
|
135
|
+
///
|
|
136
|
+
/// Holds the default extraction configuration loaded from config file
|
|
137
|
+
/// (via discovery or explicit path). Per-request configs override these defaults.
|
|
138
|
+
#[derive(Debug, Clone)]
|
|
139
|
+
pub struct ApiState {
|
|
140
|
+
/// Default extraction configuration
|
|
141
|
+
pub default_config: Arc<ExtractionConfig>,
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/// Cache statistics response.
|
|
145
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
146
|
+
pub struct CacheStatsResponse {
|
|
147
|
+
/// Cache directory path
|
|
148
|
+
pub directory: String,
|
|
149
|
+
/// Total number of cache files
|
|
150
|
+
pub total_files: usize,
|
|
151
|
+
/// Total cache size in MB
|
|
152
|
+
pub total_size_mb: f64,
|
|
153
|
+
/// Available disk space in MB
|
|
154
|
+
pub available_space_mb: f64,
|
|
155
|
+
/// Age of oldest file in days
|
|
156
|
+
pub oldest_file_age_days: f64,
|
|
157
|
+
/// Age of newest file in days
|
|
158
|
+
pub newest_file_age_days: f64,
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/// Cache clear response.
|
|
162
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
163
|
+
pub struct CacheClearResponse {
|
|
164
|
+
/// Cache directory path
|
|
165
|
+
pub directory: String,
|
|
166
|
+
/// Number of files removed
|
|
167
|
+
pub removed_files: usize,
|
|
168
|
+
/// Space freed in MB
|
|
169
|
+
pub freed_mb: f64,
|
|
170
|
+
}
|