kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
|
@@ -1,81 +1,81 @@
|
|
|
1
|
-
//! API error handling.
|
|
2
|
-
|
|
3
|
-
use axum::{
|
|
4
|
-
Json,
|
|
5
|
-
http::StatusCode,
|
|
6
|
-
response::{IntoResponse, Response},
|
|
7
|
-
};
|
|
8
|
-
|
|
9
|
-
use crate::error::KreuzbergError;
|
|
10
|
-
|
|
11
|
-
use super::types::ErrorResponse;
|
|
12
|
-
|
|
13
|
-
/// API-specific error wrapper.
|
|
14
|
-
#[derive(Debug)]
|
|
15
|
-
pub struct ApiError {
|
|
16
|
-
/// HTTP status code
|
|
17
|
-
pub status: StatusCode,
|
|
18
|
-
/// Error response body
|
|
19
|
-
pub body: ErrorResponse,
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
impl ApiError {
|
|
23
|
-
/// Create a new API error.
|
|
24
|
-
pub fn new(status: StatusCode, error: KreuzbergError) -> Self {
|
|
25
|
-
let error_type = match &error {
|
|
26
|
-
KreuzbergError::Validation { .. } => "ValidationError",
|
|
27
|
-
KreuzbergError::Parsing { .. } => "ParsingError",
|
|
28
|
-
KreuzbergError::Ocr { .. } => "OCRError",
|
|
29
|
-
KreuzbergError::Io(_) => "IOError",
|
|
30
|
-
KreuzbergError::Cache { .. } => "CacheError",
|
|
31
|
-
KreuzbergError::ImageProcessing { .. } => "ImageProcessingError",
|
|
32
|
-
KreuzbergError::Serialization { .. } => "SerializationError",
|
|
33
|
-
KreuzbergError::MissingDependency(_) => "MissingDependencyError",
|
|
34
|
-
KreuzbergError::Plugin { .. } => "PluginError",
|
|
35
|
-
KreuzbergError::LockPoisoned(_) => "LockPoisonedError",
|
|
36
|
-
KreuzbergError::UnsupportedFormat(_) => "UnsupportedFormatError",
|
|
37
|
-
KreuzbergError::Other(_) => "Error",
|
|
38
|
-
};
|
|
39
|
-
|
|
40
|
-
Self {
|
|
41
|
-
status,
|
|
42
|
-
body: ErrorResponse {
|
|
43
|
-
error_type: error_type.to_string(),
|
|
44
|
-
message: error.to_string(),
|
|
45
|
-
traceback: None,
|
|
46
|
-
status_code: status.as_u16(),
|
|
47
|
-
},
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/// Create a validation error (400).
|
|
52
|
-
pub fn validation(error: KreuzbergError) -> Self {
|
|
53
|
-
Self::new(StatusCode::BAD_REQUEST, error)
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
/// Create an unprocessable entity error (422).
|
|
57
|
-
pub fn unprocessable(error: KreuzbergError) -> Self {
|
|
58
|
-
Self::new(StatusCode::UNPROCESSABLE_ENTITY, error)
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
/// Create an internal server error (500).
|
|
62
|
-
pub fn internal(error: KreuzbergError) -> Self {
|
|
63
|
-
Self::new(StatusCode::INTERNAL_SERVER_ERROR, error)
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
impl IntoResponse for ApiError {
|
|
68
|
-
fn into_response(self) -> Response {
|
|
69
|
-
(self.status, Json(self.body)).into_response()
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
impl From<KreuzbergError> for ApiError {
|
|
74
|
-
fn from(error: KreuzbergError) -> Self {
|
|
75
|
-
match &error {
|
|
76
|
-
KreuzbergError::Validation { .. } => Self::validation(error),
|
|
77
|
-
KreuzbergError::Parsing { .. } | KreuzbergError::Ocr { .. } => Self::unprocessable(error),
|
|
78
|
-
_ => Self::internal(error),
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
}
|
|
1
|
+
//! API error handling.
|
|
2
|
+
|
|
3
|
+
use axum::{
|
|
4
|
+
Json,
|
|
5
|
+
http::StatusCode,
|
|
6
|
+
response::{IntoResponse, Response},
|
|
7
|
+
};
|
|
8
|
+
|
|
9
|
+
use crate::error::KreuzbergError;
|
|
10
|
+
|
|
11
|
+
use super::types::ErrorResponse;
|
|
12
|
+
|
|
13
|
+
/// API-specific error wrapper.
|
|
14
|
+
#[derive(Debug)]
|
|
15
|
+
pub struct ApiError {
|
|
16
|
+
/// HTTP status code
|
|
17
|
+
pub status: StatusCode,
|
|
18
|
+
/// Error response body
|
|
19
|
+
pub body: ErrorResponse,
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
impl ApiError {
|
|
23
|
+
/// Create a new API error.
|
|
24
|
+
pub fn new(status: StatusCode, error: KreuzbergError) -> Self {
|
|
25
|
+
let error_type = match &error {
|
|
26
|
+
KreuzbergError::Validation { .. } => "ValidationError",
|
|
27
|
+
KreuzbergError::Parsing { .. } => "ParsingError",
|
|
28
|
+
KreuzbergError::Ocr { .. } => "OCRError",
|
|
29
|
+
KreuzbergError::Io(_) => "IOError",
|
|
30
|
+
KreuzbergError::Cache { .. } => "CacheError",
|
|
31
|
+
KreuzbergError::ImageProcessing { .. } => "ImageProcessingError",
|
|
32
|
+
KreuzbergError::Serialization { .. } => "SerializationError",
|
|
33
|
+
KreuzbergError::MissingDependency(_) => "MissingDependencyError",
|
|
34
|
+
KreuzbergError::Plugin { .. } => "PluginError",
|
|
35
|
+
KreuzbergError::LockPoisoned(_) => "LockPoisonedError",
|
|
36
|
+
KreuzbergError::UnsupportedFormat(_) => "UnsupportedFormatError",
|
|
37
|
+
KreuzbergError::Other(_) => "Error",
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
Self {
|
|
41
|
+
status,
|
|
42
|
+
body: ErrorResponse {
|
|
43
|
+
error_type: error_type.to_string(),
|
|
44
|
+
message: error.to_string(),
|
|
45
|
+
traceback: None,
|
|
46
|
+
status_code: status.as_u16(),
|
|
47
|
+
},
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Create a validation error (400).
|
|
52
|
+
pub fn validation(error: KreuzbergError) -> Self {
|
|
53
|
+
Self::new(StatusCode::BAD_REQUEST, error)
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/// Create an unprocessable entity error (422).
|
|
57
|
+
pub fn unprocessable(error: KreuzbergError) -> Self {
|
|
58
|
+
Self::new(StatusCode::UNPROCESSABLE_ENTITY, error)
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Create an internal server error (500).
|
|
62
|
+
pub fn internal(error: KreuzbergError) -> Self {
|
|
63
|
+
Self::new(StatusCode::INTERNAL_SERVER_ERROR, error)
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
impl IntoResponse for ApiError {
|
|
68
|
+
fn into_response(self) -> Response {
|
|
69
|
+
(self.status, Json(self.body)).into_response()
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
impl From<KreuzbergError> for ApiError {
|
|
74
|
+
fn from(error: KreuzbergError) -> Self {
|
|
75
|
+
match &error {
|
|
76
|
+
KreuzbergError::Validation { .. } => Self::validation(error),
|
|
77
|
+
KreuzbergError::Parsing { .. } | KreuzbergError::Ocr { .. } => Self::unprocessable(error),
|
|
78
|
+
_ => Self::internal(error),
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
@@ -1,199 +1,199 @@
|
|
|
1
|
-
//! API request handlers.
|
|
2
|
-
|
|
3
|
-
use axum::{
|
|
4
|
-
Json,
|
|
5
|
-
extract::{Multipart, State},
|
|
6
|
-
};
|
|
7
|
-
|
|
8
|
-
use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
9
|
-
|
|
10
|
-
use super::{
|
|
11
|
-
error::ApiError,
|
|
12
|
-
types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
|
|
13
|
-
};
|
|
14
|
-
|
|
15
|
-
/// Extract endpoint handler.
|
|
16
|
-
///
|
|
17
|
-
/// POST /extract
|
|
18
|
-
///
|
|
19
|
-
/// Accepts multipart form data with:
|
|
20
|
-
/// - `files`: One or more files to extract
|
|
21
|
-
/// - `config` (optional): JSON extraction configuration (overrides server defaults)
|
|
22
|
-
///
|
|
23
|
-
/// Returns a list of extraction results, one per file.
|
|
24
|
-
///
|
|
25
|
-
/// # Size Limits
|
|
26
|
-
///
|
|
27
|
-
/// Request body size limits are enforced at the router layer via `RequestBodyLimitLayer`.
|
|
28
|
-
/// Default limits:
|
|
29
|
-
/// - Total request body: 100 MB (all files + form data combined)
|
|
30
|
-
/// - Individual multipart fields: Controlled by Axum's default multipart limits
|
|
31
|
-
///
|
|
32
|
-
/// If a request exceeds the size limit, it will be rejected with HTTP 413 (Payload Too Large).
|
|
33
|
-
///
|
|
34
|
-
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
35
|
-
/// is used as the base, and any per-request config overrides those defaults.
|
|
36
|
-
pub async fn extract_handler(
|
|
37
|
-
State(state): State<ApiState>,
|
|
38
|
-
mut multipart: Multipart,
|
|
39
|
-
) -> Result<Json<ExtractResponse>, ApiError> {
|
|
40
|
-
let mut files = Vec::new();
|
|
41
|
-
let mut config = (*state.default_config).clone();
|
|
42
|
-
|
|
43
|
-
while let Some(field) = multipart
|
|
44
|
-
.next_field()
|
|
45
|
-
.await
|
|
46
|
-
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?
|
|
47
|
-
{
|
|
48
|
-
let field_name = field.name().unwrap_or("").to_string();
|
|
49
|
-
|
|
50
|
-
match field_name.as_str() {
|
|
51
|
-
"files" => {
|
|
52
|
-
let file_name = field.file_name().map(|s| s.to_string());
|
|
53
|
-
let content_type = field.content_type().map(|s| s.to_string());
|
|
54
|
-
let data = field
|
|
55
|
-
.bytes()
|
|
56
|
-
.await
|
|
57
|
-
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
58
|
-
|
|
59
|
-
let mime_type = content_type.unwrap_or_else(|| "application/octet-stream".to_string());
|
|
60
|
-
|
|
61
|
-
files.push((data.to_vec(), mime_type, file_name));
|
|
62
|
-
}
|
|
63
|
-
"config" => {
|
|
64
|
-
let config_str = field
|
|
65
|
-
.text()
|
|
66
|
-
.await
|
|
67
|
-
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
68
|
-
|
|
69
|
-
config = serde_json::from_str(&config_str).map_err(|e| {
|
|
70
|
-
ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
71
|
-
"Invalid extraction configuration: {}",
|
|
72
|
-
e
|
|
73
|
-
)))
|
|
74
|
-
})?;
|
|
75
|
-
}
|
|
76
|
-
_ => {}
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
if files.is_empty() {
|
|
81
|
-
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
82
|
-
"No files provided for extraction",
|
|
83
|
-
)));
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
if files.len() == 1 {
|
|
87
|
-
let (data, mime_type, _file_name) = files
|
|
88
|
-
.into_iter()
|
|
89
|
-
.next()
|
|
90
|
-
.expect("files.len() == 1 guarantees one element exists");
|
|
91
|
-
let result = extract_bytes(&data, mime_type.as_str(), &config).await?;
|
|
92
|
-
return Ok(Json(vec![result]));
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
|
|
96
|
-
|
|
97
|
-
let file_refs: Vec<(&[u8], &str)> = files_data
|
|
98
|
-
.iter()
|
|
99
|
-
.map(|(data, mime)| (data.as_slice(), mime.as_str()))
|
|
100
|
-
.collect();
|
|
101
|
-
|
|
102
|
-
let results = batch_extract_bytes(file_refs, &config).await?;
|
|
103
|
-
Ok(Json(results))
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
/// Health check endpoint handler.
|
|
107
|
-
///
|
|
108
|
-
/// GET /health
|
|
109
|
-
pub async fn health_handler() -> Json<HealthResponse> {
|
|
110
|
-
Json(HealthResponse {
|
|
111
|
-
status: "healthy".to_string(),
|
|
112
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
113
|
-
})
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
/// Server info endpoint handler.
|
|
117
|
-
///
|
|
118
|
-
/// GET /info
|
|
119
|
-
pub async fn info_handler() -> Json<InfoResponse> {
|
|
120
|
-
Json(InfoResponse {
|
|
121
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
122
|
-
rust_backend: true,
|
|
123
|
-
})
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
/// Cache stats endpoint handler.
|
|
127
|
-
///
|
|
128
|
-
/// GET /cache/stats
|
|
129
|
-
///
|
|
130
|
-
/// # Errors
|
|
131
|
-
///
|
|
132
|
-
/// Returns `ApiError::Internal` if:
|
|
133
|
-
/// - Current directory cannot be determined
|
|
134
|
-
/// - Cache directory path contains non-UTF8 characters
|
|
135
|
-
/// - Cache metadata retrieval fails
|
|
136
|
-
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
137
|
-
let cache_dir = std::env::current_dir()
|
|
138
|
-
.map_err(|e| {
|
|
139
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
140
|
-
"Failed to get current directory: {}",
|
|
141
|
-
e
|
|
142
|
-
)))
|
|
143
|
-
})?
|
|
144
|
-
.join(".kreuzberg");
|
|
145
|
-
|
|
146
|
-
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
147
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
148
|
-
"Cache directory path contains non-UTF8 characters: {}",
|
|
149
|
-
cache_dir.display()
|
|
150
|
-
)))
|
|
151
|
-
})?;
|
|
152
|
-
|
|
153
|
-
let stats = cache::get_cache_metadata(cache_dir_str).map_err(ApiError::internal)?;
|
|
154
|
-
|
|
155
|
-
Ok(Json(CacheStatsResponse {
|
|
156
|
-
directory: cache_dir.to_string_lossy().to_string(),
|
|
157
|
-
total_files: stats.total_files,
|
|
158
|
-
total_size_mb: stats.total_size_mb,
|
|
159
|
-
available_space_mb: stats.available_space_mb,
|
|
160
|
-
oldest_file_age_days: stats.oldest_file_age_days,
|
|
161
|
-
newest_file_age_days: stats.newest_file_age_days,
|
|
162
|
-
}))
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
/// Cache clear endpoint handler.
|
|
166
|
-
///
|
|
167
|
-
/// DELETE /cache/clear
|
|
168
|
-
///
|
|
169
|
-
/// # Errors
|
|
170
|
-
///
|
|
171
|
-
/// Returns `ApiError::Internal` if:
|
|
172
|
-
/// - Current directory cannot be determined
|
|
173
|
-
/// - Cache directory path contains non-UTF8 characters
|
|
174
|
-
/// - Cache clearing operation fails
|
|
175
|
-
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
176
|
-
let cache_dir = std::env::current_dir()
|
|
177
|
-
.map_err(|e| {
|
|
178
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
179
|
-
"Failed to get current directory: {}",
|
|
180
|
-
e
|
|
181
|
-
)))
|
|
182
|
-
})?
|
|
183
|
-
.join(".kreuzberg");
|
|
184
|
-
|
|
185
|
-
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
186
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
187
|
-
"Cache directory path contains non-UTF8 characters: {}",
|
|
188
|
-
cache_dir.display()
|
|
189
|
-
)))
|
|
190
|
-
})?;
|
|
191
|
-
|
|
192
|
-
let (removed_files, freed_mb) = cache::clear_cache_directory(cache_dir_str).map_err(ApiError::internal)?;
|
|
193
|
-
|
|
194
|
-
Ok(Json(CacheClearResponse {
|
|
195
|
-
directory: cache_dir.to_string_lossy().to_string(),
|
|
196
|
-
removed_files,
|
|
197
|
-
freed_mb,
|
|
198
|
-
}))
|
|
199
|
-
}
|
|
1
|
+
//! API request handlers.
|
|
2
|
+
|
|
3
|
+
use axum::{
|
|
4
|
+
Json,
|
|
5
|
+
extract::{Multipart, State},
|
|
6
|
+
};
|
|
7
|
+
|
|
8
|
+
use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
9
|
+
|
|
10
|
+
use super::{
|
|
11
|
+
error::ApiError,
|
|
12
|
+
types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
/// Extract endpoint handler.
|
|
16
|
+
///
|
|
17
|
+
/// POST /extract
|
|
18
|
+
///
|
|
19
|
+
/// Accepts multipart form data with:
|
|
20
|
+
/// - `files`: One or more files to extract
|
|
21
|
+
/// - `config` (optional): JSON extraction configuration (overrides server defaults)
|
|
22
|
+
///
|
|
23
|
+
/// Returns a list of extraction results, one per file.
|
|
24
|
+
///
|
|
25
|
+
/// # Size Limits
|
|
26
|
+
///
|
|
27
|
+
/// Request body size limits are enforced at the router layer via `RequestBodyLimitLayer`.
|
|
28
|
+
/// Default limits:
|
|
29
|
+
/// - Total request body: 100 MB (all files + form data combined)
|
|
30
|
+
/// - Individual multipart fields: Controlled by Axum's default multipart limits
|
|
31
|
+
///
|
|
32
|
+
/// If a request exceeds the size limit, it will be rejected with HTTP 413 (Payload Too Large).
|
|
33
|
+
///
|
|
34
|
+
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
35
|
+
/// is used as the base, and any per-request config overrides those defaults.
|
|
36
|
+
pub async fn extract_handler(
|
|
37
|
+
State(state): State<ApiState>,
|
|
38
|
+
mut multipart: Multipart,
|
|
39
|
+
) -> Result<Json<ExtractResponse>, ApiError> {
|
|
40
|
+
let mut files = Vec::new();
|
|
41
|
+
let mut config = (*state.default_config).clone();
|
|
42
|
+
|
|
43
|
+
while let Some(field) = multipart
|
|
44
|
+
.next_field()
|
|
45
|
+
.await
|
|
46
|
+
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?
|
|
47
|
+
{
|
|
48
|
+
let field_name = field.name().unwrap_or("").to_string();
|
|
49
|
+
|
|
50
|
+
match field_name.as_str() {
|
|
51
|
+
"files" => {
|
|
52
|
+
let file_name = field.file_name().map(|s| s.to_string());
|
|
53
|
+
let content_type = field.content_type().map(|s| s.to_string());
|
|
54
|
+
let data = field
|
|
55
|
+
.bytes()
|
|
56
|
+
.await
|
|
57
|
+
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
58
|
+
|
|
59
|
+
let mime_type = content_type.unwrap_or_else(|| "application/octet-stream".to_string());
|
|
60
|
+
|
|
61
|
+
files.push((data.to_vec(), mime_type, file_name));
|
|
62
|
+
}
|
|
63
|
+
"config" => {
|
|
64
|
+
let config_str = field
|
|
65
|
+
.text()
|
|
66
|
+
.await
|
|
67
|
+
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
68
|
+
|
|
69
|
+
config = serde_json::from_str(&config_str).map_err(|e| {
|
|
70
|
+
ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
71
|
+
"Invalid extraction configuration: {}",
|
|
72
|
+
e
|
|
73
|
+
)))
|
|
74
|
+
})?;
|
|
75
|
+
}
|
|
76
|
+
_ => {}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
if files.is_empty() {
|
|
81
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
82
|
+
"No files provided for extraction",
|
|
83
|
+
)));
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
if files.len() == 1 {
|
|
87
|
+
let (data, mime_type, _file_name) = files
|
|
88
|
+
.into_iter()
|
|
89
|
+
.next()
|
|
90
|
+
.expect("files.len() == 1 guarantees one element exists");
|
|
91
|
+
let result = extract_bytes(&data, mime_type.as_str(), &config).await?;
|
|
92
|
+
return Ok(Json(vec![result]));
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
|
|
96
|
+
|
|
97
|
+
let file_refs: Vec<(&[u8], &str)> = files_data
|
|
98
|
+
.iter()
|
|
99
|
+
.map(|(data, mime)| (data.as_slice(), mime.as_str()))
|
|
100
|
+
.collect();
|
|
101
|
+
|
|
102
|
+
let results = batch_extract_bytes(file_refs, &config).await?;
|
|
103
|
+
Ok(Json(results))
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/// Health check endpoint handler.
|
|
107
|
+
///
|
|
108
|
+
/// GET /health
|
|
109
|
+
pub async fn health_handler() -> Json<HealthResponse> {
|
|
110
|
+
Json(HealthResponse {
|
|
111
|
+
status: "healthy".to_string(),
|
|
112
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
113
|
+
})
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/// Server info endpoint handler.
|
|
117
|
+
///
|
|
118
|
+
/// GET /info
|
|
119
|
+
pub async fn info_handler() -> Json<InfoResponse> {
|
|
120
|
+
Json(InfoResponse {
|
|
121
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
122
|
+
rust_backend: true,
|
|
123
|
+
})
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/// Cache stats endpoint handler.
|
|
127
|
+
///
|
|
128
|
+
/// GET /cache/stats
|
|
129
|
+
///
|
|
130
|
+
/// # Errors
|
|
131
|
+
///
|
|
132
|
+
/// Returns `ApiError::Internal` if:
|
|
133
|
+
/// - Current directory cannot be determined
|
|
134
|
+
/// - Cache directory path contains non-UTF8 characters
|
|
135
|
+
/// - Cache metadata retrieval fails
|
|
136
|
+
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
137
|
+
let cache_dir = std::env::current_dir()
|
|
138
|
+
.map_err(|e| {
|
|
139
|
+
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
140
|
+
"Failed to get current directory: {}",
|
|
141
|
+
e
|
|
142
|
+
)))
|
|
143
|
+
})?
|
|
144
|
+
.join(".kreuzberg");
|
|
145
|
+
|
|
146
|
+
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
147
|
+
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
148
|
+
"Cache directory path contains non-UTF8 characters: {}",
|
|
149
|
+
cache_dir.display()
|
|
150
|
+
)))
|
|
151
|
+
})?;
|
|
152
|
+
|
|
153
|
+
let stats = cache::get_cache_metadata(cache_dir_str).map_err(ApiError::internal)?;
|
|
154
|
+
|
|
155
|
+
Ok(Json(CacheStatsResponse {
|
|
156
|
+
directory: cache_dir.to_string_lossy().to_string(),
|
|
157
|
+
total_files: stats.total_files,
|
|
158
|
+
total_size_mb: stats.total_size_mb,
|
|
159
|
+
available_space_mb: stats.available_space_mb,
|
|
160
|
+
oldest_file_age_days: stats.oldest_file_age_days,
|
|
161
|
+
newest_file_age_days: stats.newest_file_age_days,
|
|
162
|
+
}))
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/// Cache clear endpoint handler.
|
|
166
|
+
///
|
|
167
|
+
/// DELETE /cache/clear
|
|
168
|
+
///
|
|
169
|
+
/// # Errors
|
|
170
|
+
///
|
|
171
|
+
/// Returns `ApiError::Internal` if:
|
|
172
|
+
/// - Current directory cannot be determined
|
|
173
|
+
/// - Cache directory path contains non-UTF8 characters
|
|
174
|
+
/// - Cache clearing operation fails
|
|
175
|
+
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
176
|
+
let cache_dir = std::env::current_dir()
|
|
177
|
+
.map_err(|e| {
|
|
178
|
+
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
179
|
+
"Failed to get current directory: {}",
|
|
180
|
+
e
|
|
181
|
+
)))
|
|
182
|
+
})?
|
|
183
|
+
.join(".kreuzberg");
|
|
184
|
+
|
|
185
|
+
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
186
|
+
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
187
|
+
"Cache directory path contains non-UTF8 characters: {}",
|
|
188
|
+
cache_dir.display()
|
|
189
|
+
)))
|
|
190
|
+
})?;
|
|
191
|
+
|
|
192
|
+
let (removed_files, freed_mb) = cache::clear_cache_directory(cache_dir_str).map_err(ApiError::internal)?;
|
|
193
|
+
|
|
194
|
+
Ok(Json(CacheClearResponse {
|
|
195
|
+
directory: cache_dir.to_string_lossy().to_string(),
|
|
196
|
+
removed_files,
|
|
197
|
+
freed_mb,
|
|
198
|
+
}))
|
|
199
|
+
}
|