kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
#![cfg(feature = "api")]
|
|
2
|
+
//! Integration tests for large PDF file extraction (issue #248).
|
|
3
|
+
//!
|
|
4
|
+
//! Tests verify that the Kreuzberg API server can handle large PDF files
|
|
5
|
+
//! without size limits or with very large limits (>2MB, >10MB, >100MB).
|
|
6
|
+
//!
|
|
7
|
+
//! These tests are designed to be TDD tests - they FAIL with the current
|
|
8
|
+
//! implementation if size limits are enforced, demonstrating the bug.
|
|
9
|
+
//!
|
|
10
|
+
//! The tests ensure:
|
|
11
|
+
//! - Large PDFs (>2MB) can be extracted without rejection
|
|
12
|
+
//! - Multipart uploads handle large payloads correctly
|
|
13
|
+
//! - Server doesn't impose unreasonable size restrictions
|
|
14
|
+
//! - Configuration allows tuning limits for different deployment scenarios
|
|
15
|
+
|
|
16
|
+
use axum::{
|
|
17
|
+
body::Body,
|
|
18
|
+
http::{Request, StatusCode},
|
|
19
|
+
};
|
|
20
|
+
use kreuzberg::{
|
|
21
|
+
ExtractionConfig,
|
|
22
|
+
api::{ApiSizeLimits, create_router_with_limits},
|
|
23
|
+
};
|
|
24
|
+
use tower::ServiceExt;
|
|
25
|
+
|
|
26
|
+
/// Helper function to create mock PDF content of a specified size.
///
/// Emits a `%PDF-1.4` header, a catalog (object 1), a page tree (object 2),
/// one page (object 3) and a single content stream (object 4) made of a
/// repeated text-drawing line, followed by a cross-reference table that
/// records the byte offset of every object and a trailer pointing at it.
///
/// The page dictionary declares no `/Resources`, so the `/F1` font named in
/// the stream is not defined anywhere in the file; the stream text is filler
/// used only to reach the requested size.
///
/// # Arguments
///
/// * `size_bytes` - Target size of the PDF in bytes
///
/// # Returns
///
/// A Vec<u8> containing PDF content of approximately the specified size. The
/// stream always holds at least one line, so very small targets yield a
/// document larger than requested; larger targets are never exceeded.
fn create_mock_pdf_content(size_bytes: usize) -> Vec<u8> {
    // One repetition of the stream payload. The second newline is part of the
    // repeated unit, so it must be counted when sizing the stream.
    const STREAM_LINE: &[u8] = b"BT /F1 12 Tf 50 750 Td (Large PDF Content for Testing) Tj ET\n\n";
    // Upper bound for everything written after the stream data: the stream
    // dictionary, `endstream`/`endobj`, five 20-byte xref entries and the trailer.
    const TAIL_RESERVE: usize = 256;

    let mut content = Vec::with_capacity(size_bytes);
    content.extend_from_slice(b"%PDF-1.4\n");

    let fixed_objects: [&[u8]; 3] = [
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>\nendobj\n",
    ];

    // Byte offset of each object, in object-number order, for the xref table.
    let mut object_offsets = Vec::with_capacity(fixed_objects.len() + 1);
    for object in fixed_objects.iter() {
        object_offsets.push(content.len());
        content.extend_from_slice(object);
    }

    // Fill whatever space is left after the fixed parts, but never emit an
    // empty stream.
    let repeat_count = (size_bytes.saturating_sub(content.len() + TAIL_RESERVE) / STREAM_LINE.len()).max(1);
    let stream_len = STREAM_LINE.len() * repeat_count;

    object_offsets.push(content.len());
    content.extend_from_slice(format!("4 0 obj\n<< /Length {} >>\nstream\n", stream_len).as_bytes());
    for _ in 0..repeat_count {
        content.extend_from_slice(STREAM_LINE);
    }
    // The newline before `endstream` is a separator and is not part of /Length.
    content.extend_from_slice(b"\nendstream\nendobj\n");

    let xref_offset = content.len();
    let entry_count = object_offsets.len() + 1;
    content.extend_from_slice(format!("xref\n0 {}\n0000000000 65535 f \n", entry_count).as_bytes());
    for offset in &object_offsets {
        // Each entry is exactly 20 bytes: 10-digit offset, 5-digit generation,
        // the in-use marker and a two-byte line ending.
        content.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
    }

    content.extend_from_slice(
        format!(
            "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
            entry_count, xref_offset
        )
        .as_bytes(),
    );

    content
}
|
|
88
|
+
|
|
89
|
+
/// Helper function to create a multipart request body with a PDF file.
///
/// Produces a `multipart/form-data` body with exactly one part: a form field
/// named `files` carrying `pdf_content` with the `application/pdf` content
/// type, terminated by the closing boundary.
///
/// # Arguments
///
/// * `boundary` - The multipart boundary string (without the leading `--`)
/// * `pdf_content` - The PDF file content as bytes, copied verbatim into the part
/// * `filename` - Name of the PDF file, inserted unescaped into `Content-Disposition`
///
/// # Returns
///
/// A Vec<u8> containing the complete multipart request body
fn create_multipart_pdf_request(boundary: &str, pdf_content: &[u8], filename: &str) -> Vec<u8> {
    // Opening delimiter, part headers and the blank line that ends the headers.
    let preamble = format!(
        "--{}\r\nContent-Disposition: form-data; name=\"files\"; filename=\"{}\"\r\nContent-Type: application/pdf\r\n\r\n",
        boundary, filename
    );
    // Closing delimiter that marks the end of the multipart body.
    let epilogue = format!("\r\n--{}--\r\n", boundary);

    [preamble.as_bytes(), pdf_content, epilogue.as_bytes()].concat()
}
|
|
126
|
+
|
|
127
|
+
/// Test extracting a 5MB PDF file.
|
|
128
|
+
///
|
|
129
|
+
/// This test verifies that the API can handle PDF files larger than 2MB,
|
|
130
|
+
/// which was the issue reported in #248. The test should FAIL if the server
|
|
131
|
+
/// is rejecting requests based on file size limits.
|
|
132
|
+
///
|
|
133
|
+
/// # Expected Behavior
|
|
134
|
+
///
|
|
135
|
+
/// The request should succeed with HTTP 200 and return valid extraction results.
|
|
136
|
+
/// If the server has a hard limit below 5MB, this test will fail with HTTP 413
|
|
137
|
+
/// (Payload Too Large).
|
|
138
|
+
#[tokio::test]
|
|
139
|
+
async fn test_extract_5mb_pdf_file() {
|
|
140
|
+
let limits = ApiSizeLimits::from_mb(10, 10);
|
|
141
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
142
|
+
|
|
143
|
+
let pdf_size = 5 * 1024 * 1024;
|
|
144
|
+
let pdf_content = create_mock_pdf_content(pdf_size);
|
|
145
|
+
|
|
146
|
+
let boundary = "----large-pdf-boundary";
|
|
147
|
+
let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_5mb.pdf");
|
|
148
|
+
|
|
149
|
+
let request = Request::builder()
|
|
150
|
+
.method("POST")
|
|
151
|
+
.uri("/extract")
|
|
152
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
153
|
+
.header("content-length", request_body.len())
|
|
154
|
+
.body(Body::from(request_body))
|
|
155
|
+
.expect("Failed to build request");
|
|
156
|
+
|
|
157
|
+
let response = router.oneshot(request).await.expect("Request failed");
|
|
158
|
+
|
|
159
|
+
assert_eq!(
|
|
160
|
+
response.status(),
|
|
161
|
+
StatusCode::OK,
|
|
162
|
+
"Should successfully extract 5MB PDF file. If status is 413, the server has size limit issues (issue #248)."
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
/// Test extracting a 10MB PDF file.
|
|
167
|
+
///
|
|
168
|
+
/// This test pushes the size limits further to verify that the API can handle
|
|
169
|
+
/// significantly large PDF files (10x the original problem size of 1MB).
|
|
170
|
+
///
|
|
171
|
+
/// # Expected Behavior
|
|
172
|
+
///
|
|
173
|
+
/// The request should succeed with HTTP 200. If this fails with HTTP 413,
|
|
174
|
+
/// it indicates the server's default size limits are too restrictive.
|
|
175
|
+
#[tokio::test]
|
|
176
|
+
async fn test_extract_10mb_pdf_file() {
|
|
177
|
+
let limits = ApiSizeLimits::from_mb(20, 20);
|
|
178
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
179
|
+
|
|
180
|
+
let pdf_size = 10 * 1024 * 1024;
|
|
181
|
+
let pdf_content = create_mock_pdf_content(pdf_size);
|
|
182
|
+
|
|
183
|
+
let boundary = "----large-pdf-boundary";
|
|
184
|
+
let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_10mb.pdf");
|
|
185
|
+
|
|
186
|
+
let request = Request::builder()
|
|
187
|
+
.method("POST")
|
|
188
|
+
.uri("/extract")
|
|
189
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
190
|
+
.header("content-length", request_body.len())
|
|
191
|
+
.body(Body::from(request_body))
|
|
192
|
+
.expect("Failed to build request");
|
|
193
|
+
|
|
194
|
+
let response = router.oneshot(request).await.expect("Request failed");
|
|
195
|
+
|
|
196
|
+
assert_eq!(
|
|
197
|
+
response.status(),
|
|
198
|
+
StatusCode::OK,
|
|
199
|
+
"Should successfully extract 10MB PDF file without size limit rejection"
|
|
200
|
+
);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/// Test extracting a 100MB PDF file.
|
|
204
|
+
///
|
|
205
|
+
/// This test verifies that the API can handle very large PDF files (100x the
|
|
206
|
+
/// original problem size). This is important for production deployments that
|
|
207
|
+
/// need to process large document repositories.
|
|
208
|
+
///
|
|
209
|
+
/// Note: This test may require significant memory and time.
|
|
210
|
+
///
|
|
211
|
+
/// # Expected Behavior
|
|
212
|
+
///
|
|
213
|
+
/// The request should succeed with HTTP 200. The test uses very large limits
|
|
214
|
+
/// (500MB) to allow the file to be processed.
|
|
215
|
+
#[tokio::test]
|
|
216
|
+
#[ignore]
|
|
217
|
+
async fn test_extract_100mb_pdf_file() {
|
|
218
|
+
let limits = ApiSizeLimits::from_mb(500, 500);
|
|
219
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
220
|
+
|
|
221
|
+
let pdf_size = 100 * 1024 * 1024;
|
|
222
|
+
let pdf_content = create_mock_pdf_content(pdf_size);
|
|
223
|
+
|
|
224
|
+
let boundary = "----large-pdf-boundary";
|
|
225
|
+
let request_body = create_multipart_pdf_request(boundary, &pdf_content, "large_100mb.pdf");
|
|
226
|
+
|
|
227
|
+
let request = Request::builder()
|
|
228
|
+
.method("POST")
|
|
229
|
+
.uri("/extract")
|
|
230
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
231
|
+
.header("content-length", request_body.len())
|
|
232
|
+
.body(Body::from(request_body))
|
|
233
|
+
.expect("Failed to build request");
|
|
234
|
+
|
|
235
|
+
let response = router.oneshot(request).await.expect("Request failed");
|
|
236
|
+
|
|
237
|
+
assert_eq!(
|
|
238
|
+
response.status(),
|
|
239
|
+
StatusCode::OK,
|
|
240
|
+
"Should successfully extract 100MB PDF file. Requires --ignored flag to run and significant memory."
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/// Test that default size limits can be exceeded with custom configuration.
|
|
245
|
+
///
|
|
246
|
+
/// This test verifies that the API respects custom size limit configuration,
|
|
247
|
+
/// allowing deployments to tune limits based on their requirements.
|
|
248
|
+
///
|
|
249
|
+
/// # Expected Behavior
|
|
250
|
+
///
|
|
251
|
+
/// A 6MB file should fail with the default 100MB limit (actually it shouldn't fail,
|
|
252
|
+
/// but it demonstrates how to check if custom limits work). We test with a router
|
|
253
|
+
/// configured for smaller limits, then larger limits.
|
|
254
|
+
#[tokio::test]
|
|
255
|
+
async fn test_size_limits_configurable() {
|
|
256
|
+
let pdf_size = 6 * 1024 * 1024;
|
|
257
|
+
let pdf_content = create_mock_pdf_content(pdf_size);
|
|
258
|
+
let boundary = "----size-limit-test";
|
|
259
|
+
|
|
260
|
+
let small_limits = ApiSizeLimits::from_mb(5, 5);
|
|
261
|
+
let router_small = create_router_with_limits(ExtractionConfig::default(), small_limits);
|
|
262
|
+
|
|
263
|
+
let request_body = create_multipart_pdf_request(boundary, &pdf_content, "test_6mb.pdf");
|
|
264
|
+
|
|
265
|
+
let request = Request::builder()
|
|
266
|
+
.method("POST")
|
|
267
|
+
.uri("/extract")
|
|
268
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
269
|
+
.header("content-length", request_body.len())
|
|
270
|
+
.body(Body::from(request_body.clone()))
|
|
271
|
+
.expect("Failed to build request");
|
|
272
|
+
|
|
273
|
+
let response_small = router_small.oneshot(request).await.expect("Request failed");
|
|
274
|
+
|
|
275
|
+
assert_eq!(
|
|
276
|
+
response_small.status(),
|
|
277
|
+
StatusCode::PAYLOAD_TOO_LARGE,
|
|
278
|
+
"6MB file should be rejected when limit is 5MB"
|
|
279
|
+
);
|
|
280
|
+
|
|
281
|
+
let large_limits = ApiSizeLimits::from_mb(10, 10);
|
|
282
|
+
let router_large = create_router_with_limits(ExtractionConfig::default(), large_limits);
|
|
283
|
+
|
|
284
|
+
let request = Request::builder()
|
|
285
|
+
.method("POST")
|
|
286
|
+
.uri("/extract")
|
|
287
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
288
|
+
.header("content-length", request_body.len())
|
|
289
|
+
.body(Body::from(request_body))
|
|
290
|
+
.expect("Failed to build request");
|
|
291
|
+
|
|
292
|
+
let response_large = router_large.oneshot(request).await.expect("Request failed");
|
|
293
|
+
|
|
294
|
+
assert_eq!(
|
|
295
|
+
response_large.status(),
|
|
296
|
+
StatusCode::OK,
|
|
297
|
+
"6MB file should be accepted when limit is 10MB"
|
|
298
|
+
);
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
/// Test that custom limits work via ApiSizeLimits::from_mb.
|
|
302
|
+
///
|
|
303
|
+
/// This test verifies the public API for configuring size limits,
|
|
304
|
+
/// ensuring that applications can set limits appropriate for their use case.
|
|
305
|
+
///
|
|
306
|
+
/// # Expected Behavior
|
|
307
|
+
///
|
|
308
|
+
/// The test creates limits for 15MB and 20MB separately, demonstrating
|
|
309
|
+
/// different request/field limits.
|
|
310
|
+
#[tokio::test]
|
|
311
|
+
async fn test_api_size_limits_from_mb() {
|
|
312
|
+
let limits_15 = ApiSizeLimits::from_mb(15, 15);
|
|
313
|
+
assert_eq!(limits_15.max_request_body_bytes, 15 * 1024 * 1024);
|
|
314
|
+
assert_eq!(limits_15.max_multipart_field_bytes, 15 * 1024 * 1024);
|
|
315
|
+
|
|
316
|
+
let limits_20_10 = ApiSizeLimits::from_mb(20, 10);
|
|
317
|
+
assert_eq!(limits_20_10.max_request_body_bytes, 20 * 1024 * 1024);
|
|
318
|
+
assert_eq!(limits_20_10.max_multipart_field_bytes, 10 * 1024 * 1024);
|
|
319
|
+
|
|
320
|
+
let router_15 = create_router_with_limits(ExtractionConfig::default(), limits_15);
|
|
321
|
+
let router_20_10 = create_router_with_limits(ExtractionConfig::default(), limits_20_10);
|
|
322
|
+
|
|
323
|
+
assert!(size_of_val(&router_15) > 0);
|
|
324
|
+
assert!(size_of_val(&router_20_10) > 0);
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/// Test multipart upload with large payload handles streaming correctly.
|
|
328
|
+
///
|
|
329
|
+
/// This test verifies that the multipart parser can handle large payloads
|
|
330
|
+
/// without loading the entire file into memory at once, which is important
|
|
331
|
+
/// for processing very large documents.
|
|
332
|
+
///
|
|
333
|
+
/// # Expected Behavior
|
|
334
|
+
///
|
|
335
|
+
/// A 12MB file sent via multipart should be accepted if limits allow.
|
|
336
|
+
/// The API should handle streaming without excessive memory consumption.
|
|
337
|
+
#[tokio::test]
|
|
338
|
+
async fn test_multipart_large_payload_streaming() {
|
|
339
|
+
let limits = ApiSizeLimits::from_mb(15, 15);
|
|
340
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
341
|
+
|
|
342
|
+
let pdf_size = 12 * 1024 * 1024;
|
|
343
|
+
let pdf_content = create_mock_pdf_content(pdf_size);
|
|
344
|
+
|
|
345
|
+
let boundary = "----multipart-stream-test";
|
|
346
|
+
let request_body = create_multipart_pdf_request(boundary, &pdf_content, "stream_test_12mb.pdf");
|
|
347
|
+
|
|
348
|
+
let request = Request::builder()
|
|
349
|
+
.method("POST")
|
|
350
|
+
.uri("/extract")
|
|
351
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
352
|
+
.header("content-length", request_body.len())
|
|
353
|
+
.body(Body::from(request_body))
|
|
354
|
+
.expect("Failed to build request");
|
|
355
|
+
|
|
356
|
+
let response = router.oneshot(request).await.expect("Request failed");
|
|
357
|
+
|
|
358
|
+
assert_eq!(
|
|
359
|
+
response.status(),
|
|
360
|
+
StatusCode::OK,
|
|
361
|
+
"Multipart upload with 12MB payload should be handled via streaming"
|
|
362
|
+
);
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
/// Test that gigabyte-scale limits can be configured.
|
|
366
|
+
///
|
|
367
|
+
/// This test verifies that the API can be configured with very large limits
|
|
368
|
+
/// suitable for enterprise deployments that need to process massive documents.
|
|
369
|
+
///
|
|
370
|
+
/// # Expected Behavior
|
|
371
|
+
///
|
|
372
|
+
/// The API should support limit configurations up to gigabyte scale without
|
|
373
|
+
/// panicking or causing overflow. This test doesn't actually send gigabyte
|
|
374
|
+
/// files (due to memory constraints), but verifies configuration is possible.
|
|
375
|
+
#[tokio::test]
|
|
376
|
+
async fn test_gigabyte_scale_limits() {
|
|
377
|
+
let limits = ApiSizeLimits::from_mb(1024, 1024);
|
|
378
|
+
assert_eq!(limits.max_request_body_bytes, 1024 * 1024 * 1024);
|
|
379
|
+
assert_eq!(limits.max_multipart_field_bytes, 1024 * 1024 * 1024);
|
|
380
|
+
|
|
381
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
382
|
+
|
|
383
|
+
assert!(size_of_val(&router) > 0);
|
|
384
|
+
|
|
385
|
+
let health_request = Request::builder()
|
|
386
|
+
.uri("/health")
|
|
387
|
+
.body(Body::empty())
|
|
388
|
+
.expect("Failed to build health check request");
|
|
389
|
+
|
|
390
|
+
let response = router.oneshot(health_request).await.expect("Request failed");
|
|
391
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
/// Test extracting multiple large PDF files in a single request.
|
|
395
|
+
///
|
|
396
|
+
/// This test verifies that batch processing of large files works correctly,
|
|
397
|
+
/// with the total request size being the sum of all file sizes.
|
|
398
|
+
///
|
|
399
|
+
/// # Expected Behavior
|
|
400
|
+
///
|
|
401
|
+
/// Two 4MB PDFs (8MB total) should be accepted when limits are 15MB,
|
|
402
|
+
/// demonstrating that the limit applies to total request size, not per-file.
|
|
403
|
+
#[tokio::test]
|
|
404
|
+
async fn test_extract_multiple_large_pdfs() {
|
|
405
|
+
let limits = ApiSizeLimits::from_mb(15, 15);
|
|
406
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
407
|
+
|
|
408
|
+
let pdf_size = 4 * 1024 * 1024;
|
|
409
|
+
let pdf_content_1 = create_mock_pdf_content(pdf_size);
|
|
410
|
+
let pdf_content_2 = create_mock_pdf_content(pdf_size);
|
|
411
|
+
|
|
412
|
+
let boundary = "----multi-large-boundary";
|
|
413
|
+
let mut request_body = Vec::new();
|
|
414
|
+
|
|
415
|
+
request_body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
416
|
+
request_body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large1.pdf\"\r\n");
|
|
417
|
+
request_body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
|
|
418
|
+
request_body.extend_from_slice(&pdf_content_1);
|
|
419
|
+
request_body.extend_from_slice(b"\r\n");
|
|
420
|
+
|
|
421
|
+
request_body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
422
|
+
request_body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large2.pdf\"\r\n");
|
|
423
|
+
request_body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
|
|
424
|
+
request_body.extend_from_slice(&pdf_content_2);
|
|
425
|
+
request_body.extend_from_slice(b"\r\n");
|
|
426
|
+
|
|
427
|
+
request_body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
428
|
+
|
|
429
|
+
let request = Request::builder()
|
|
430
|
+
.method("POST")
|
|
431
|
+
.uri("/extract")
|
|
432
|
+
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
433
|
+
.header("content-length", request_body.len())
|
|
434
|
+
.body(Body::from(request_body))
|
|
435
|
+
.expect("Failed to build request");
|
|
436
|
+
|
|
437
|
+
let response = router.oneshot(request).await.expect("Request failed");
|
|
438
|
+
|
|
439
|
+
assert_eq!(
|
|
440
|
+
response.status(),
|
|
441
|
+
StatusCode::OK,
|
|
442
|
+
"Should successfully extract multiple large PDF files when total size within limits"
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
/// Test that API respects environment variable configuration for size limits.
|
|
447
|
+
///
|
|
448
|
+
/// This test documents how the API parses size limits from the environment,
|
|
449
|
+
/// via the ServerConfig which handles environment variable reading.
|
|
450
|
+
///
|
|
451
|
+
/// # Note
|
|
452
|
+
///
|
|
453
|
+
/// This test verifies the ApiSizeLimits struct itself can be configured,
|
|
454
|
+
/// demonstrating the pattern that environment variables should follow.
|
|
455
|
+
#[tokio::test]
|
|
456
|
+
async fn test_environment_configurable_limits_pattern() {
|
|
457
|
+
let env_configured_mb = 256;
|
|
458
|
+
|
|
459
|
+
let limits = ApiSizeLimits::from_mb(env_configured_mb, env_configured_mb);
|
|
460
|
+
let router = create_router_with_limits(ExtractionConfig::default(), limits);
|
|
461
|
+
|
|
462
|
+
assert_eq!(limits.max_request_body_bytes, 256 * 1024 * 1024);
|
|
463
|
+
|
|
464
|
+
let health_request = Request::builder()
|
|
465
|
+
.uri("/health")
|
|
466
|
+
.body(Body::empty())
|
|
467
|
+
.expect("Failed to build health check request");
|
|
468
|
+
|
|
469
|
+
let response = router.oneshot(health_request).await.expect("Request failed");
|
|
470
|
+
assert_eq!(response.status(), StatusCode::OK);
|
|
471
|
+
}
|