kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +4 -104
- data/README.md +454 -432
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -182
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -46
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -32
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -85
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -103
- data/lib/pdfium.dll +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -537
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +45 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +230 -221
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -891
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
- data/vendor/kreuzberg/src/embeddings.rs +500 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -569
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -417
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -161
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +44 -81
- data/vendor/rb-sys/bin/release.sh +0 -21
|
@@ -1,353 +1,353 @@
|
|
|
1
|
-
//! API server setup and configuration.
|
|
2
|
-
|
|
3
|
-
use std::{
|
|
4
|
-
net::{IpAddr, SocketAddr},
|
|
5
|
-
sync::Arc,
|
|
6
|
-
};
|
|
7
|
-
|
|
8
|
-
use axum::{
|
|
9
|
-
Router,
|
|
10
|
-
routing::{delete, get, post},
|
|
11
|
-
};
|
|
12
|
-
use tower_http::{
|
|
13
|
-
cors::{AllowOrigin, Any, CorsLayer},
|
|
14
|
-
limit::RequestBodyLimitLayer,
|
|
15
|
-
trace::TraceLayer,
|
|
16
|
-
};
|
|
17
|
-
|
|
18
|
-
use crate::{ExtractionConfig, Result};
|
|
19
|
-
|
|
20
|
-
use super::{
|
|
21
|
-
handlers::{cache_clear_handler, cache_stats_handler, extract_handler, health_handler, info_handler},
|
|
22
|
-
types::{ApiSizeLimits, ApiState},
|
|
23
|
-
};
|
|
24
|
-
|
|
25
|
-
/// Parse size limits from environment variables.
|
|
26
|
-
///
|
|
27
|
-
/// Reads `KREUZBERG_MAX_UPLOAD_SIZE_MB` to configure upload size limits.
|
|
28
|
-
/// Falls back to default (100 MB) if not set or invalid.
|
|
29
|
-
fn parse_size_limits_from_env() -> ApiSizeLimits {
|
|
30
|
-
match std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
|
|
31
|
-
Ok(value) => match value.parse::<usize>() {
|
|
32
|
-
Ok(mb) if mb > 0 => {
|
|
33
|
-
tracing::info!(
|
|
34
|
-
"Upload size limit configured from environment: {} MB ({} bytes)",
|
|
35
|
-
mb,
|
|
36
|
-
mb * 1024 * 1024
|
|
37
|
-
);
|
|
38
|
-
ApiSizeLimits::from_mb(mb, mb)
|
|
39
|
-
}
|
|
40
|
-
Ok(_) => {
|
|
41
|
-
tracing::warn!("Invalid KREUZBERG_MAX_UPLOAD_SIZE_MB value (must be > 0), using default 100 MB");
|
|
42
|
-
let limits = ApiSizeLimits::default();
|
|
43
|
-
tracing::info!(
|
|
44
|
-
"Upload size limit: 100 MB (default, {} bytes)",
|
|
45
|
-
limits.max_request_body_bytes
|
|
46
|
-
);
|
|
47
|
-
limits
|
|
48
|
-
}
|
|
49
|
-
Err(e) => {
|
|
50
|
-
tracing::warn!(
|
|
51
|
-
"Failed to parse KREUZBERG_MAX_UPLOAD_SIZE_MB='{}': {}, using default 100 MB",
|
|
52
|
-
value,
|
|
53
|
-
e
|
|
54
|
-
);
|
|
55
|
-
let limits = ApiSizeLimits::default();
|
|
56
|
-
tracing::info!(
|
|
57
|
-
"Upload size limit: 100 MB (default, {} bytes)",
|
|
58
|
-
limits.max_request_body_bytes
|
|
59
|
-
);
|
|
60
|
-
limits
|
|
61
|
-
}
|
|
62
|
-
},
|
|
63
|
-
Err(_) => {
|
|
64
|
-
let limits = ApiSizeLimits::default();
|
|
65
|
-
tracing::info!(
|
|
66
|
-
"Upload size limit: 100 MB (default, {} bytes)",
|
|
67
|
-
limits.max_request_body_bytes
|
|
68
|
-
);
|
|
69
|
-
limits
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
/// Create the API router with all routes configured.
|
|
75
|
-
///
|
|
76
|
-
/// This is public to allow users to embed the router in their own applications.
|
|
77
|
-
///
|
|
78
|
-
/// # Arguments
|
|
79
|
-
///
|
|
80
|
-
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
81
|
-
///
|
|
82
|
-
/// # Examples
|
|
83
|
-
///
|
|
84
|
-
/// ```no_run
|
|
85
|
-
/// use kreuzberg::{ExtractionConfig, api::create_router};
|
|
86
|
-
///
|
|
87
|
-
/// # #[tokio::main]
|
|
88
|
-
/// # async fn main() {
|
|
89
|
-
/// // Create router with default config and size limits
|
|
90
|
-
/// let config = ExtractionConfig::default();
|
|
91
|
-
/// let router = create_router(config);
|
|
92
|
-
/// # }
|
|
93
|
-
/// ```
|
|
94
|
-
pub fn create_router(config: ExtractionConfig) -> Router {
|
|
95
|
-
create_router_with_limits(config, ApiSizeLimits::default())
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/// Create the API router with custom size limits.
|
|
99
|
-
///
|
|
100
|
-
/// This allows fine-grained control over request body and multipart field size limits.
|
|
101
|
-
///
|
|
102
|
-
/// # Arguments
|
|
103
|
-
///
|
|
104
|
-
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
105
|
-
/// * `limits` - Size limits for request bodies and multipart uploads.
|
|
106
|
-
///
|
|
107
|
-
/// # Examples
|
|
108
|
-
///
|
|
109
|
-
/// ```no_run
|
|
110
|
-
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
111
|
-
///
|
|
112
|
-
/// # #[tokio::main]
|
|
113
|
-
/// # async fn main() {
|
|
114
|
-
/// // Create router with 50 MB limits
|
|
115
|
-
/// let config = ExtractionConfig::default();
|
|
116
|
-
/// let limits = ApiSizeLimits::from_mb(50, 50);
|
|
117
|
-
/// let router = create_router_with_limits(config, limits);
|
|
118
|
-
/// # }
|
|
119
|
-
/// ```
|
|
120
|
-
///
|
|
121
|
-
/// ```no_run
|
|
122
|
-
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
123
|
-
/// use tower_http::limit::RequestBodyLimitLayer;
|
|
124
|
-
///
|
|
125
|
-
/// # #[tokio::main]
|
|
126
|
-
/// # async fn main() {
|
|
127
|
-
/// // Custom limits for very large documents (500 MB)
|
|
128
|
-
/// let config = ExtractionConfig::default();
|
|
129
|
-
/// let limits = ApiSizeLimits::from_mb(500, 500);
|
|
130
|
-
/// let router = create_router_with_limits(config, limits);
|
|
131
|
-
/// # }
|
|
132
|
-
/// ```
|
|
133
|
-
pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
|
|
134
|
-
let state = ApiState {
|
|
135
|
-
default_config: Arc::new(config),
|
|
136
|
-
};
|
|
137
|
-
|
|
138
|
-
// SECURITY WARNING: The default allows all origins for development convenience,
|
|
139
|
-
let cors_layer = if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
|
|
140
|
-
let origins: Vec<_> = origins_str
|
|
141
|
-
.split(',')
|
|
142
|
-
.filter(|s| !s.trim().is_empty())
|
|
143
|
-
.filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
|
|
144
|
-
.collect();
|
|
145
|
-
|
|
146
|
-
if !origins.is_empty() {
|
|
147
|
-
tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
|
|
148
|
-
CorsLayer::new()
|
|
149
|
-
.allow_origin(AllowOrigin::list(origins))
|
|
150
|
-
.allow_methods(Any)
|
|
151
|
-
.allow_headers(Any)
|
|
152
|
-
} else {
|
|
153
|
-
tracing::warn!(
|
|
154
|
-
"KREUZBERG_CORS_ORIGINS set but empty/invalid - falling back to permissive CORS. \
|
|
155
|
-
This allows CSRF attacks. Set explicit origins for production."
|
|
156
|
-
);
|
|
157
|
-
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
158
|
-
}
|
|
159
|
-
} else {
|
|
160
|
-
tracing::warn!(
|
|
161
|
-
"CORS configured to allow all origins (default). This permits CSRF attacks. \
|
|
162
|
-
For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
|
|
163
|
-
list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
|
|
164
|
-
);
|
|
165
|
-
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
166
|
-
};
|
|
167
|
-
|
|
168
|
-
Router::new()
|
|
169
|
-
.route("/extract", post(extract_handler))
|
|
170
|
-
.route("/health", get(health_handler))
|
|
171
|
-
.route("/info", get(info_handler))
|
|
172
|
-
.route("/cache/stats", get(cache_stats_handler))
|
|
173
|
-
.route("/cache/clear", delete(cache_clear_handler))
|
|
174
|
-
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
175
|
-
.layer(cors_layer)
|
|
176
|
-
.layer(TraceLayer::new_for_http())
|
|
177
|
-
.with_state(state)
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/// Start the API server with config file discovery.
|
|
181
|
-
///
|
|
182
|
-
/// Searches for kreuzberg.toml/yaml/json in current and parent directories.
|
|
183
|
-
/// If no config file is found, uses default configuration.
|
|
184
|
-
///
|
|
185
|
-
/// # Arguments
|
|
186
|
-
///
|
|
187
|
-
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
188
|
-
/// * `port` - Port number to bind to (e.g., 8000)
|
|
189
|
-
///
|
|
190
|
-
/// # Examples
|
|
191
|
-
///
|
|
192
|
-
/// ```no_run
|
|
193
|
-
/// use kreuzberg::api::serve;
|
|
194
|
-
///
|
|
195
|
-
/// #[tokio::main]
|
|
196
|
-
/// async fn main() -> kreuzberg::Result<()> {
|
|
197
|
-
/// // Local development
|
|
198
|
-
/// serve("127.0.0.1", 8000).await?;
|
|
199
|
-
/// Ok(())
|
|
200
|
-
/// }
|
|
201
|
-
/// ```
|
|
202
|
-
///
|
|
203
|
-
/// ```no_run
|
|
204
|
-
/// use kreuzberg::api::serve;
|
|
205
|
-
///
|
|
206
|
-
/// #[tokio::main]
|
|
207
|
-
/// async fn main() -> kreuzberg::Result<()> {
|
|
208
|
-
/// // Docker/production (listen on all interfaces)
|
|
209
|
-
/// serve("0.0.0.0", 8000).await?;
|
|
210
|
-
/// Ok(())
|
|
211
|
-
/// }
|
|
212
|
-
/// ```
|
|
213
|
-
///
|
|
214
|
-
/// # Environment Variables
|
|
215
|
-
///
|
|
216
|
-
/// ```bash
|
|
217
|
-
/// # Python/Docker usage
|
|
218
|
-
/// export KREUZBERG_HOST=0.0.0.0
|
|
219
|
-
/// export KREUZBERG_PORT=8000
|
|
220
|
-
///
|
|
221
|
-
/// # CORS configuration (IMPORTANT for production security)
|
|
222
|
-
/// # Default: allows all origins (permits CSRF attacks)
|
|
223
|
-
/// # Production: set to comma-separated list of allowed origins
|
|
224
|
-
/// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
|
|
225
|
-
///
|
|
226
|
-
/// # Upload size limit (default: 100 MB)
|
|
227
|
-
/// export KREUZBERG_MAX_UPLOAD_SIZE_MB=200
|
|
228
|
-
///
|
|
229
|
-
/// python -m kreuzberg.api
|
|
230
|
-
/// ```
|
|
231
|
-
pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
232
|
-
let config = match ExtractionConfig::discover()? {
|
|
233
|
-
Some(config) => {
|
|
234
|
-
tracing::info!("Loaded extraction config from discovered file");
|
|
235
|
-
config
|
|
236
|
-
}
|
|
237
|
-
None => {
|
|
238
|
-
tracing::info!("No config file found, using default configuration");
|
|
239
|
-
ExtractionConfig::default()
|
|
240
|
-
}
|
|
241
|
-
};
|
|
242
|
-
|
|
243
|
-
let limits = parse_size_limits_from_env();
|
|
244
|
-
|
|
245
|
-
serve_with_config_and_limits(host, port, config, limits).await
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
/// Start the API server with explicit config.
|
|
249
|
-
///
|
|
250
|
-
/// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
|
|
251
|
-
///
|
|
252
|
-
/// # Arguments
|
|
253
|
-
///
|
|
254
|
-
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
255
|
-
/// * `port` - Port number to bind to (e.g., 8000)
|
|
256
|
-
/// * `config` - Default extraction configuration for all requests
|
|
257
|
-
///
|
|
258
|
-
/// # Examples
|
|
259
|
-
///
|
|
260
|
-
/// ```no_run
|
|
261
|
-
/// use kreuzberg::{ExtractionConfig, api::serve_with_config};
|
|
262
|
-
///
|
|
263
|
-
/// #[tokio::main]
|
|
264
|
-
/// async fn main() -> kreuzberg::Result<()> {
|
|
265
|
-
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
266
|
-
/// serve_with_config("127.0.0.1", 8000, config).await?;
|
|
267
|
-
/// Ok(())
|
|
268
|
-
/// }
|
|
269
|
-
/// ```
|
|
270
|
-
pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
|
|
271
|
-
let limits = ApiSizeLimits::default();
|
|
272
|
-
tracing::info!(
|
|
273
|
-
"Upload size limit: 100 MB (default, {} bytes)",
|
|
274
|
-
limits.max_request_body_bytes
|
|
275
|
-
);
|
|
276
|
-
serve_with_config_and_limits(host, port, config, limits).await
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
/// Start the API server with explicit config and size limits.
|
|
280
|
-
///
|
|
281
|
-
/// # Arguments
|
|
282
|
-
///
|
|
283
|
-
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
284
|
-
/// * `port` - Port number to bind to (e.g., 8000)
|
|
285
|
-
/// * `config` - Default extraction configuration for all requests
|
|
286
|
-
/// * `limits` - Size limits for request bodies and multipart uploads
|
|
287
|
-
///
|
|
288
|
-
/// # Examples
|
|
289
|
-
///
|
|
290
|
-
/// ```no_run
|
|
291
|
-
/// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
|
|
292
|
-
///
|
|
293
|
-
/// #[tokio::main]
|
|
294
|
-
/// async fn main() -> kreuzberg::Result<()> {
|
|
295
|
-
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
296
|
-
/// let limits = ApiSizeLimits::from_mb(200, 200);
|
|
297
|
-
/// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
|
|
298
|
-
/// Ok(())
|
|
299
|
-
/// }
|
|
300
|
-
/// ```
|
|
301
|
-
pub async fn serve_with_config_and_limits(
|
|
302
|
-
host: impl AsRef<str>,
|
|
303
|
-
port: u16,
|
|
304
|
-
config: ExtractionConfig,
|
|
305
|
-
limits: ApiSizeLimits,
|
|
306
|
-
) -> Result<()> {
|
|
307
|
-
let ip: IpAddr = host
|
|
308
|
-
.as_ref()
|
|
309
|
-
.parse()
|
|
310
|
-
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
311
|
-
|
|
312
|
-
let addr = SocketAddr::new(ip, port);
|
|
313
|
-
let app = create_router_with_limits(config, limits);
|
|
314
|
-
|
|
315
|
-
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
316
|
-
|
|
317
|
-
let listener = tokio::net::TcpListener::bind(addr)
|
|
318
|
-
.await
|
|
319
|
-
.map_err(crate::error::KreuzbergError::Io)?;
|
|
320
|
-
|
|
321
|
-
axum::serve(listener, app)
|
|
322
|
-
.await
|
|
323
|
-
.map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
|
|
324
|
-
|
|
325
|
-
Ok(())
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
/// Start the API server with default host and port.
|
|
329
|
-
///
|
|
330
|
-
/// Defaults: host = "127.0.0.1", port = 8000
|
|
331
|
-
///
|
|
332
|
-
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
333
|
-
pub async fn serve_default() -> Result<()> {
|
|
334
|
-
serve("127.0.0.1", 8000).await
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: building a router from the default config must not panic.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// Builds a router and checks that the resulting value has a non-zero size.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = std::mem::size_of_val(&router);
        assert!(router_size > 0);
    }
}
|
|
1
|
+
//! API server setup and configuration.
|
|
2
|
+
|
|
3
|
+
use std::{
|
|
4
|
+
net::{IpAddr, SocketAddr},
|
|
5
|
+
sync::Arc,
|
|
6
|
+
};
|
|
7
|
+
|
|
8
|
+
use axum::{
|
|
9
|
+
Router,
|
|
10
|
+
routing::{delete, get, post},
|
|
11
|
+
};
|
|
12
|
+
use tower_http::{
|
|
13
|
+
cors::{AllowOrigin, Any, CorsLayer},
|
|
14
|
+
limit::RequestBodyLimitLayer,
|
|
15
|
+
trace::TraceLayer,
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
use crate::{ExtractionConfig, Result};
|
|
19
|
+
|
|
20
|
+
use super::{
|
|
21
|
+
handlers::{cache_clear_handler, cache_stats_handler, extract_handler, health_handler, info_handler},
|
|
22
|
+
types::{ApiSizeLimits, ApiState},
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
/// Parse size limits from environment variables.
|
|
26
|
+
///
|
|
27
|
+
/// Reads `KREUZBERG_MAX_UPLOAD_SIZE_MB` to configure upload size limits.
|
|
28
|
+
/// Falls back to default (100 MB) if not set or invalid.
|
|
29
|
+
fn parse_size_limits_from_env() -> ApiSizeLimits {
|
|
30
|
+
match std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
|
|
31
|
+
Ok(value) => match value.parse::<usize>() {
|
|
32
|
+
Ok(mb) if mb > 0 => {
|
|
33
|
+
tracing::info!(
|
|
34
|
+
"Upload size limit configured from environment: {} MB ({} bytes)",
|
|
35
|
+
mb,
|
|
36
|
+
mb * 1024 * 1024
|
|
37
|
+
);
|
|
38
|
+
ApiSizeLimits::from_mb(mb, mb)
|
|
39
|
+
}
|
|
40
|
+
Ok(_) => {
|
|
41
|
+
tracing::warn!("Invalid KREUZBERG_MAX_UPLOAD_SIZE_MB value (must be > 0), using default 100 MB");
|
|
42
|
+
let limits = ApiSizeLimits::default();
|
|
43
|
+
tracing::info!(
|
|
44
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
45
|
+
limits.max_request_body_bytes
|
|
46
|
+
);
|
|
47
|
+
limits
|
|
48
|
+
}
|
|
49
|
+
Err(e) => {
|
|
50
|
+
tracing::warn!(
|
|
51
|
+
"Failed to parse KREUZBERG_MAX_UPLOAD_SIZE_MB='{}': {}, using default 100 MB",
|
|
52
|
+
value,
|
|
53
|
+
e
|
|
54
|
+
);
|
|
55
|
+
let limits = ApiSizeLimits::default();
|
|
56
|
+
tracing::info!(
|
|
57
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
58
|
+
limits.max_request_body_bytes
|
|
59
|
+
);
|
|
60
|
+
limits
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
Err(_) => {
|
|
64
|
+
let limits = ApiSizeLimits::default();
|
|
65
|
+
tracing::info!(
|
|
66
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
67
|
+
limits.max_request_body_bytes
|
|
68
|
+
);
|
|
69
|
+
limits
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/// Create the API router with all routes configured.
|
|
75
|
+
///
|
|
76
|
+
/// This is public to allow users to embed the router in their own applications.
|
|
77
|
+
///
|
|
78
|
+
/// # Arguments
|
|
79
|
+
///
|
|
80
|
+
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
81
|
+
///
|
|
82
|
+
/// # Examples
|
|
83
|
+
///
|
|
84
|
+
/// ```no_run
|
|
85
|
+
/// use kreuzberg::{ExtractionConfig, api::create_router};
|
|
86
|
+
///
|
|
87
|
+
/// # #[tokio::main]
|
|
88
|
+
/// # async fn main() {
|
|
89
|
+
/// // Create router with default config and size limits
|
|
90
|
+
/// let config = ExtractionConfig::default();
|
|
91
|
+
/// let router = create_router(config);
|
|
92
|
+
/// # }
|
|
93
|
+
/// ```
|
|
94
|
+
pub fn create_router(config: ExtractionConfig) -> Router {
|
|
95
|
+
create_router_with_limits(config, ApiSizeLimits::default())
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// Create the API router with custom size limits.
|
|
99
|
+
///
|
|
100
|
+
/// This allows fine-grained control over request body and multipart field size limits.
|
|
101
|
+
///
|
|
102
|
+
/// # Arguments
|
|
103
|
+
///
|
|
104
|
+
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
105
|
+
/// * `limits` - Size limits for request bodies and multipart uploads.
|
|
106
|
+
///
|
|
107
|
+
/// # Examples
|
|
108
|
+
///
|
|
109
|
+
/// ```no_run
|
|
110
|
+
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
111
|
+
///
|
|
112
|
+
/// # #[tokio::main]
|
|
113
|
+
/// # async fn main() {
|
|
114
|
+
/// // Create router with 50 MB limits
|
|
115
|
+
/// let config = ExtractionConfig::default();
|
|
116
|
+
/// let limits = ApiSizeLimits::from_mb(50, 50);
|
|
117
|
+
/// let router = create_router_with_limits(config, limits);
|
|
118
|
+
/// # }
|
|
119
|
+
/// ```
|
|
120
|
+
///
|
|
121
|
+
/// ```no_run
|
|
122
|
+
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
123
|
+
/// use tower_http::limit::RequestBodyLimitLayer;
|
|
124
|
+
///
|
|
125
|
+
/// # #[tokio::main]
|
|
126
|
+
/// # async fn main() {
|
|
127
|
+
/// // Custom limits for very large documents (500 MB)
|
|
128
|
+
/// let config = ExtractionConfig::default();
|
|
129
|
+
/// let limits = ApiSizeLimits::from_mb(500, 500);
|
|
130
|
+
/// let router = create_router_with_limits(config, limits);
|
|
131
|
+
/// # }
|
|
132
|
+
/// ```
|
|
133
|
+
pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
|
|
134
|
+
let state = ApiState {
|
|
135
|
+
default_config: Arc::new(config),
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
// SECURITY WARNING: The default allows all origins for development convenience,
|
|
139
|
+
let cors_layer = if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
|
|
140
|
+
let origins: Vec<_> = origins_str
|
|
141
|
+
.split(',')
|
|
142
|
+
.filter(|s| !s.trim().is_empty())
|
|
143
|
+
.filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
|
|
144
|
+
.collect();
|
|
145
|
+
|
|
146
|
+
if !origins.is_empty() {
|
|
147
|
+
tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
|
|
148
|
+
CorsLayer::new()
|
|
149
|
+
.allow_origin(AllowOrigin::list(origins))
|
|
150
|
+
.allow_methods(Any)
|
|
151
|
+
.allow_headers(Any)
|
|
152
|
+
} else {
|
|
153
|
+
tracing::warn!(
|
|
154
|
+
"KREUZBERG_CORS_ORIGINS set but empty/invalid - falling back to permissive CORS. \
|
|
155
|
+
This allows CSRF attacks. Set explicit origins for production."
|
|
156
|
+
);
|
|
157
|
+
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
158
|
+
}
|
|
159
|
+
} else {
|
|
160
|
+
tracing::warn!(
|
|
161
|
+
"CORS configured to allow all origins (default). This permits CSRF attacks. \
|
|
162
|
+
For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
|
|
163
|
+
list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
|
|
164
|
+
);
|
|
165
|
+
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
Router::new()
|
|
169
|
+
.route("/extract", post(extract_handler))
|
|
170
|
+
.route("/health", get(health_handler))
|
|
171
|
+
.route("/info", get(info_handler))
|
|
172
|
+
.route("/cache/stats", get(cache_stats_handler))
|
|
173
|
+
.route("/cache/clear", delete(cache_clear_handler))
|
|
174
|
+
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
175
|
+
.layer(cors_layer)
|
|
176
|
+
.layer(TraceLayer::new_for_http())
|
|
177
|
+
.with_state(state)
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/// Start the API server with config file discovery.
|
|
181
|
+
///
|
|
182
|
+
/// Searches for kreuzberg.toml/yaml/json in current and parent directories.
|
|
183
|
+
/// If no config file is found, uses default configuration.
|
|
184
|
+
///
|
|
185
|
+
/// # Arguments
|
|
186
|
+
///
|
|
187
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
188
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
189
|
+
///
|
|
190
|
+
/// # Examples
|
|
191
|
+
///
|
|
192
|
+
/// ```no_run
|
|
193
|
+
/// use kreuzberg::api::serve;
|
|
194
|
+
///
|
|
195
|
+
/// #[tokio::main]
|
|
196
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
197
|
+
/// // Local development
|
|
198
|
+
/// serve("127.0.0.1", 8000).await?;
|
|
199
|
+
/// Ok(())
|
|
200
|
+
/// }
|
|
201
|
+
/// ```
|
|
202
|
+
///
|
|
203
|
+
/// ```no_run
|
|
204
|
+
/// use kreuzberg::api::serve;
|
|
205
|
+
///
|
|
206
|
+
/// #[tokio::main]
|
|
207
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
208
|
+
/// // Docker/production (listen on all interfaces)
|
|
209
|
+
/// serve("0.0.0.0", 8000).await?;
|
|
210
|
+
/// Ok(())
|
|
211
|
+
/// }
|
|
212
|
+
/// ```
|
|
213
|
+
///
|
|
214
|
+
/// # Environment Variables
|
|
215
|
+
///
|
|
216
|
+
/// ```bash
|
|
217
|
+
/// # Python/Docker usage
|
|
218
|
+
/// export KREUZBERG_HOST=0.0.0.0
|
|
219
|
+
/// export KREUZBERG_PORT=8000
|
|
220
|
+
///
|
|
221
|
+
/// # CORS configuration (IMPORTANT for production security)
|
|
222
|
+
/// # Default: allows all origins (permits CSRF attacks)
|
|
223
|
+
/// # Production: set to comma-separated list of allowed origins
|
|
224
|
+
/// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
|
|
225
|
+
///
|
|
226
|
+
/// # Upload size limit (default: 100 MB)
|
|
227
|
+
/// export KREUZBERG_MAX_UPLOAD_SIZE_MB=200
|
|
228
|
+
///
|
|
229
|
+
/// python -m kreuzberg.api
|
|
230
|
+
/// ```
|
|
231
|
+
pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
232
|
+
let config = match ExtractionConfig::discover()? {
|
|
233
|
+
Some(config) => {
|
|
234
|
+
tracing::info!("Loaded extraction config from discovered file");
|
|
235
|
+
config
|
|
236
|
+
}
|
|
237
|
+
None => {
|
|
238
|
+
tracing::info!("No config file found, using default configuration");
|
|
239
|
+
ExtractionConfig::default()
|
|
240
|
+
}
|
|
241
|
+
};
|
|
242
|
+
|
|
243
|
+
let limits = parse_size_limits_from_env();
|
|
244
|
+
|
|
245
|
+
serve_with_config_and_limits(host, port, config, limits).await
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
/// Start the API server with explicit config.
|
|
249
|
+
///
|
|
250
|
+
/// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
|
|
251
|
+
///
|
|
252
|
+
/// # Arguments
|
|
253
|
+
///
|
|
254
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
255
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
256
|
+
/// * `config` - Default extraction configuration for all requests
|
|
257
|
+
///
|
|
258
|
+
/// # Examples
|
|
259
|
+
///
|
|
260
|
+
/// ```no_run
|
|
261
|
+
/// use kreuzberg::{ExtractionConfig, api::serve_with_config};
|
|
262
|
+
///
|
|
263
|
+
/// #[tokio::main]
|
|
264
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
265
|
+
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
266
|
+
/// serve_with_config("127.0.0.1", 8000, config).await?;
|
|
267
|
+
/// Ok(())
|
|
268
|
+
/// }
|
|
269
|
+
/// ```
|
|
270
|
+
pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
|
|
271
|
+
let limits = ApiSizeLimits::default();
|
|
272
|
+
tracing::info!(
|
|
273
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
274
|
+
limits.max_request_body_bytes
|
|
275
|
+
);
|
|
276
|
+
serve_with_config_and_limits(host, port, config, limits).await
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/// Start the API server with explicit config and size limits.
|
|
280
|
+
///
|
|
281
|
+
/// # Arguments
|
|
282
|
+
///
|
|
283
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
284
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
285
|
+
/// * `config` - Default extraction configuration for all requests
|
|
286
|
+
/// * `limits` - Size limits for request bodies and multipart uploads
|
|
287
|
+
///
|
|
288
|
+
/// # Examples
|
|
289
|
+
///
|
|
290
|
+
/// ```no_run
|
|
291
|
+
/// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
|
|
292
|
+
///
|
|
293
|
+
/// #[tokio::main]
|
|
294
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
295
|
+
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
296
|
+
/// let limits = ApiSizeLimits::from_mb(200, 200);
|
|
297
|
+
/// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
|
|
298
|
+
/// Ok(())
|
|
299
|
+
/// }
|
|
300
|
+
/// ```
|
|
301
|
+
pub async fn serve_with_config_and_limits(
|
|
302
|
+
host: impl AsRef<str>,
|
|
303
|
+
port: u16,
|
|
304
|
+
config: ExtractionConfig,
|
|
305
|
+
limits: ApiSizeLimits,
|
|
306
|
+
) -> Result<()> {
|
|
307
|
+
let ip: IpAddr = host
|
|
308
|
+
.as_ref()
|
|
309
|
+
.parse()
|
|
310
|
+
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
311
|
+
|
|
312
|
+
let addr = SocketAddr::new(ip, port);
|
|
313
|
+
let app = create_router_with_limits(config, limits);
|
|
314
|
+
|
|
315
|
+
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
316
|
+
|
|
317
|
+
let listener = tokio::net::TcpListener::bind(addr)
|
|
318
|
+
.await
|
|
319
|
+
.map_err(crate::error::KreuzbergError::Io)?;
|
|
320
|
+
|
|
321
|
+
axum::serve(listener, app)
|
|
322
|
+
.await
|
|
323
|
+
.map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
|
|
324
|
+
|
|
325
|
+
Ok(())
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
/// Start the API server with default host and port.
|
|
329
|
+
///
|
|
330
|
+
/// Defaults: host = "127.0.0.1", port = 8000
|
|
331
|
+
///
|
|
332
|
+
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
333
|
+
pub async fn serve_default() -> Result<()> {
|
|
334
|
+
serve("127.0.0.1", 8000).await
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: building a router from the default config must not panic.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// Builds a router and checks that the resulting value has a non-zero size.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        let router_size = std::mem::size_of_val(&router);
        assert!(router_size > 0);
    }
}
|