kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
|
|
7
|
+
RSpec.describe 'Error Recovery' do
|
|
8
|
+
describe 'error classification and categorization' do
|
|
9
|
+
it 'raises ArgumentError for invalid configuration types' do
|
|
10
|
+
expect do
|
|
11
|
+
Kreuzberg::Config::Extraction.new(chunking: 'invalid_string')
|
|
12
|
+
end.to raise_error(ArgumentError)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
it 'classifies validation errors distinctly' do
|
|
16
|
+
error = nil
|
|
17
|
+
begin
|
|
18
|
+
Kreuzberg::Config::Extraction.new(ocr: 12_345)
|
|
19
|
+
rescue ArgumentError => e
|
|
20
|
+
error = e
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
expect(error).not_to be_nil
|
|
24
|
+
expect(error).to be_a(ArgumentError)
|
|
25
|
+
expect(error.message).to match(/OCR|Expected/)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it 'raises error for invalid OCR backend configuration' do
|
|
29
|
+
expect do
|
|
30
|
+
Kreuzberg::Config::Extraction.new(ocr: [])
|
|
31
|
+
end.to raise_error(ArgumentError)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it 'raises error for negative chunking parameters' do
|
|
35
|
+
expect { Kreuzberg::Config::Chunking.new(max_chars: -100) }
|
|
36
|
+
.to raise_error(StandardError, /negative|invalid|positive|max_chars/)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
it 'raises error for invalid embedding dimensions' do
|
|
40
|
+
# Test with embedding config validation
|
|
41
|
+
expect do
|
|
42
|
+
Kreuzberg::Config::Extraction.new(
|
|
43
|
+
chunking: Kreuzberg::Config::Chunking.new(
|
|
44
|
+
enabled: true,
|
|
45
|
+
embedding: Kreuzberg::Config::Embedding.new(
|
|
46
|
+
model: { type: :preset, name: 'invalid_model' }
|
|
47
|
+
)
|
|
48
|
+
)
|
|
49
|
+
)
|
|
50
|
+
end.not_to raise_error # May succeed, but model name validation may occur later
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
describe 'error handling in extraction operations' do
|
|
55
|
+
it 'gracefully handles file not found errors' do
|
|
56
|
+
config = Kreuzberg::Config::Extraction.new
|
|
57
|
+
nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
|
|
58
|
+
|
|
59
|
+
expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
|
|
60
|
+
.to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'provides descriptive error messages for invalid MIME types' do
|
|
64
|
+
# Invalid MIME types should raise UnsupportedFormatError
|
|
65
|
+
expect do
|
|
66
|
+
Kreuzberg.extract_bytes_sync(data: 'test', mime_type: 'application/invalid-type')
|
|
67
|
+
end.to raise_error(Kreuzberg::Errors::UnsupportedFormatError)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it 'handles empty file extraction gracefully' do
|
|
71
|
+
file = Tempfile.new(['empty', '.txt']).tap do |f|
|
|
72
|
+
f.write('')
|
|
73
|
+
f.close
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
config = Kreuzberg::Config::Extraction.new
|
|
77
|
+
result = Kreuzberg.extract_file_sync(path: file.path, config: config)
|
|
78
|
+
|
|
79
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
80
|
+
# Empty file may produce empty or minimal content
|
|
81
|
+
expect(result.content).to be_a(String)
|
|
82
|
+
|
|
83
|
+
FileUtils.rm_f(file.path)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
it 'recovers from extraction with minimal data' do
|
|
87
|
+
config = Kreuzberg::Config::Extraction.new
|
|
88
|
+
result = Kreuzberg.extract_bytes_sync(data: '', mime_type: 'text/plain', config: config)
|
|
89
|
+
|
|
90
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
91
|
+
expect(result).to respond_to(:content)
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
describe 'retry strategies and recovery patterns' do
|
|
96
|
+
it 'implements retry with exponential backoff pattern' do
|
|
97
|
+
file = Tempfile.new(['retry_test', '.txt']).tap do |f|
|
|
98
|
+
f.write('Retry strategy test content')
|
|
99
|
+
f.close
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
config = Kreuzberg::Config::Extraction.new
|
|
103
|
+
max_retries = 3
|
|
104
|
+
attempt = 0
|
|
105
|
+
|
|
106
|
+
loop do
|
|
107
|
+
attempt += 1
|
|
108
|
+
result = Kreuzberg.extract_file_sync(path: file.path, config: config)
|
|
109
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
110
|
+
break
|
|
111
|
+
rescue StandardError => e
|
|
112
|
+
raise e if attempt >= max_retries
|
|
113
|
+
|
|
114
|
+
sleep 0.1 * (2**(attempt - 1))
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
expect(attempt).to eq(1) # Should succeed on first attempt
|
|
118
|
+
|
|
119
|
+
FileUtils.rm_f(file.path)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
it 'handles retry with config modification' do
|
|
123
|
+
file = Tempfile.new(['retry_config', '.txt']).tap do |f|
|
|
124
|
+
f.write('Config modification retry')
|
|
125
|
+
f.close
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
configs = [
|
|
129
|
+
Kreuzberg::Config::Extraction.new,
|
|
130
|
+
Kreuzberg::Config::Extraction.new(use_cache: false)
|
|
131
|
+
]
|
|
132
|
+
|
|
133
|
+
results = []
|
|
134
|
+
configs.each do |config|
|
|
135
|
+
result = Kreuzberg.extract_file_sync(path: file.path, config: config)
|
|
136
|
+
results << result
|
|
137
|
+
rescue StandardError
|
|
138
|
+
# Handle error, try next config
|
|
139
|
+
next
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
expect(results).not_to be_empty
|
|
143
|
+
|
|
144
|
+
FileUtils.rm_f(file.path)
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
it 'implements circuit breaker pattern for repeated failures' do
|
|
148
|
+
circuit_state = :closed
|
|
149
|
+
|
|
150
|
+
config = Kreuzberg::Config::Extraction.new
|
|
151
|
+
text = 'Test content for circuit breaker'
|
|
152
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
153
|
+
|
|
154
|
+
# Simulate successful extraction without repeated errors
|
|
155
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
156
|
+
expect(circuit_state).to eq(:closed)
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'supports fallback configuration on extraction failure' do
|
|
160
|
+
file = Tempfile.new(['fallback', '.txt']).tap do |f|
|
|
161
|
+
f.write('Fallback configuration test')
|
|
162
|
+
f.close
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
primary_config = Kreuzberg::Config::Extraction.new
|
|
166
|
+
fallback_config = Kreuzberg::Config::Extraction.new(use_cache: false)
|
|
167
|
+
|
|
168
|
+
result = begin
|
|
169
|
+
Kreuzberg.extract_file_sync(path: file.path, config: primary_config)
|
|
170
|
+
rescue StandardError => _e
|
|
171
|
+
Kreuzberg.extract_file_sync(path: file.path, config: fallback_config)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
175
|
+
|
|
176
|
+
FileUtils.rm_f(file.path)
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
# Each example switches an optional feature (keywords, chunking, language
# detection, embeddings) on or off and expects plain-text extraction to still
# return a Kreuzberg::Result with non-empty content.
describe 'graceful degradation strategies' do
  # Synchronously extracts +text+ as text/plain using +config+.
  def extract_plain_text(text, config)
    Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
  end

  it 'degrades extraction features when dependencies unavailable' do
    config_with_keywords = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 5)
    )
    config_without_keywords = Kreuzberg::Config::Extraction.new
    text = 'Machine learning transforms technology.'

    result_with = extract_plain_text(text, config_with_keywords)
    result_without = extract_plain_text(text, config_without_keywords)

    expect(result_with).to be_a(Kreuzberg::Result)
    expect(result_without).to be_a(Kreuzberg::Result)
    # Both results must carry content, with or without keyword extraction.
    expect(result_with.content).not_to be_empty
    expect(result_without.content).not_to be_empty
  end

  it 'continues extraction without optional features' do
    config = Kreuzberg::Config::Extraction.new(
      chunking: Kreuzberg::Config::Chunking.new(enabled: false)
    )

    result = extract_plain_text('Content without chunking feature', config)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end

  it 'handles missing language detection gracefully' do
    # Language detection is explicitly switched off for this run.
    config_disabled = Kreuzberg::Config::Extraction.new(
      language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: false)
    )

    result = extract_plain_text('Machine learning content', config_disabled)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end

  it 'recovers from incomplete embedding generation' do
    config = Kreuzberg::Config::Extraction.new(
      chunking: Kreuzberg::Config::Chunking.new(
        enabled: true,
        max_chars: 100,
        embedding: Kreuzberg::Config::Embedding.new(
          model: { type: :preset, name: 'balanced' }
        )
      )
    )

    # 250 characters, so the input is longer than the 100-char max_chars setting.
    result = extract_plain_text('Test ' * 50, config)

    # Content is expected regardless of how embedding generation went.
    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end
end
|
|
257
|
+
|
|
258
|
+
# Checks that configuration and extraction failures raise errors whose class
# and message are usable for debugging.
describe 'error message clarity and debugging' do
  it 'provides informative error messages for validation failures' do
    error = nil
    begin
      Kreuzberg::Config::Extraction.new(ocr: 'invalid')
    rescue ArgumentError => e
      error = e
    end

    expect(error).not_to be_nil
    expect(error.message).to be_a(String)
    expect(error.message.length).to be > 10
  end

  it 'includes context in error messages' do
    error = nil
    begin
      Kreuzberg::Config::Chunking.new(max_overlap: -50)
    rescue StandardError => e
      error = e
    end

    expect(error).not_to be_nil
    expect(error.message).not_to be_empty
    # `match` is the regexp matcher; `include` on a String goes through
    # String#include?, which raises TypeError when handed a Regexp.
    expect(error.message.downcase).to match(/overlap|invalid|negative/)
  end

  it 'distinguishes between validation and runtime errors' do
    # Validation error: a String where a chunking config is expected.
    validation_error = nil
    begin
      Kreuzberg::Config::Extraction.new(chunking: 'invalid')
    rescue StandardError => e
      validation_error = e
    end

    expect(validation_error).to be_a(ArgumentError)

    # Runtime error: the path does not exist.
    runtime_error = nil
    begin
      Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
    rescue StandardError => e
      runtime_error = e
    end

    # NOTE(review): a missing file is asserted to surface as ValidationError;
    # confirm that this (rather than an I/O error class) is the intended mapping.
    expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
  end

  it 'provides error recovery suggestions in messages' do
    error = nil
    begin
      Kreuzberg::Config::Extraction.new(ocr: 12_345)
    rescue ArgumentError => e
      error = e
    end

    expect(error).not_to be_nil
    # Either keyword makes the message descriptive enough. A compound matcher
    # is required: `expect(...).to a || b` never evaluates `b`, because a
    # failing `a` raises and a passing `a` short-circuits.
    expect(error.message).to include('OCR').or include('Expected')
  end
end
|
|
320
|
+
|
|
321
|
+
# Exercises extraction with aggressive keyword/chunking settings and batch
# extraction, expecting a result in every case. Temp files are removed in
# `ensure` so a failing expectation cannot skip the cleanup.
describe 'recovery from partial extraction failures' do
  it 'continues extraction after keyword extraction failure' do
    config = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 1000 # Extreme value that may fail gracefully
      )
    )

    text = 'Machine learning and artificial intelligence'
    result = Kreuzberg.extract_bytes_sync(
      data: text, mime_type: 'text/plain', config: config
    )

    # Content must still be extracted even if keyword extraction has issues.
    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end

  it 'handles batch processing with some file failures' do
    valid_file = Tempfile.new(['valid_batch', '.txt']).tap do |f|
      f.write('Valid content')
      f.close
    end

    # NOTE(review): only a valid path is submitted, so no per-file failure is
    # actually exercised here despite the example name.
    paths = [valid_file.path]
    config = Kreuzberg::Config::Extraction.new

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    expect(results).to be_a(Array)
    expect(results).not_to be_empty
  ensure
    FileUtils.rm_f(valid_file.path) if valid_file
  end

  it 'recovers from chunking errors in batch' do
    file = Tempfile.new(['chunking_error', '.txt']).tap do |f|
      f.write('Content for chunking')
      f.close
    end

    config = Kreuzberg::Config::Extraction.new(
      chunking: Kreuzberg::Config::Chunking.new(
        enabled: true,
        max_chars: 10 # Very small chunk size
      )
    )

    result = Kreuzberg.extract_file_sync(path: file.path, config: config)
    expect(result).to be_a(Kreuzberg::Result)
  ensure
    FileUtils.rm_f(file.path) if file
  end
end
|
|
375
|
+
|
|
376
|
+
# Bounds extraction time and checks that larger inputs and a ten-file batch
# all produce results.
describe 'timeout and resource limit handling' do
  it 'completes extraction within reasonable time' do
    config = Kreuzberg::Config::Extraction.new
    text = 'Machine learning ' * 100

    # Monotonic clock: wall-clock adjustments cannot distort the measurement.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
    duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

    expect(result).to be_a(Kreuzberg::Result)
    expect(duration).to be < 30.0 # Should complete within 30 seconds
  end

  it 'handles large file extraction gracefully' do
    large_file = Tempfile.new(['large_file', '.txt']).tap do |f|
      f.write('Large content ' * 1000)
      f.close
    end

    config = Kreuzberg::Config::Extraction.new
    result = Kreuzberg.extract_file_sync(path: large_file.path, config: config)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  ensure
    FileUtils.rm_f(large_file.path) if large_file
  end

  it 'manages memory efficiently during large batch operations' do
    # Keep the Tempfile objects referenced until the example finishes: a
    # Tempfile deletes its file when it is garbage collected, so retaining only
    # the paths would let a GC run remove inputs before the batch is extracted.
    files = Array.new(10) do |i|
      Tempfile.new(["memory_test_#{i}", '.txt']).tap do |f|
        f.write("Memory test #{i} " * 50)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(10)
    expect(results).to all(be_a(Kreuzberg::Result))
  ensure
    # `files` is nil if creating the temp files raised.
    files&.each { |f| FileUtils.rm_f(f.path) }
  end
end
|
|
422
|
+
|
|
423
|
+
# Covers construction-time behaviour of the config objects: a negative chunk
# size must raise, and keyword settings must be readable back unchanged.
describe 'configuration error prevention' do
  it 'validates conflicting configuration options early' do
    # Negative max_chars; both constructors run inside the expectation so an
    # error from either one is observed.
    build_invalid_config = lambda do
      Kreuzberg::Config::Extraction.new(
        chunking: Kreuzberg::Config::Chunking.new(max_chars: -100)
      )
    end

    # NOTE(review): a bare `raise_error` accepts any exception; pin the
    # concrete error class once it is confirmed.
    expect(&build_invalid_config).to raise_error
  end

  it 'prevents invalid algorithm selection' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 5)
    extraction_config = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    expect(extraction_config.keywords.algorithm).to eq('yake')
  end

  it 'validates keyword configuration completeness' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 10)
    extraction_config = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    expect(extraction_config.keywords).not_to be_nil
    expect(extraction_config.keywords.algorithm).to eq('rake')
    expect(extraction_config.keywords.max_keywords).to eq(10)
  end
end
|
|
457
|
+
|
|
458
|
+
# Checks that extraction results can be inspected and collected by calling
# code. The temp file is removed in `ensure` so a failing expectation cannot
# skip the cleanup.
describe 'recovery monitoring and logging' do
  it 'tracks extraction success/failure states' do
    file = Tempfile.new(['tracking', '.txt']).tap do |f|
      f.write('Tracking content')
      f.close
    end

    config = Kreuzberg::Config::Extraction.new
    result = Kreuzberg.extract_file_sync(path: file.path, config: config)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result).to respond_to(:content)
  ensure
    FileUtils.rm_f(file.path) if file
  end

  it 'maintains extraction attempt history in application context' do
    # One result per attempt, collected in call order.
    results_history = Array.new(3) do |i|
      config = Kreuzberg::Config::Extraction.new
      Kreuzberg.extract_bytes_sync(data: "Attempt #{i}", mime_type: 'text/plain', config: config)
    end

    expect(results_history.length).to eq(3)
    expect(results_history).to all(be_a(Kreuzberg::Result))
  end
end
|
|
488
|
+
end
|
data/spec/binding/errors_spec.rb
CHANGED
|
@@ -1,66 +1,66 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
# Verifies the error class hierarchy under Kreuzberg::Errors and the extra
# attributes (context, dependency) some error classes expose.
RSpec.describe Kreuzberg::Errors do
  # Every concrete error class must descend from Kreuzberg::Errors::Error.
  shared_examples 'an Error subclass' do
    it 'is an Error subclass' do
      expect(described_class).to be < Kreuzberg::Errors::Error
    end
  end

  describe Kreuzberg::Errors::Error do
    it 'is a StandardError subclass' do
      expect(described_class).to be < StandardError
    end

    it 'can be raised with a message' do
      raising = -> { raise described_class, 'Test error' }

      expect(&raising).to raise_error(described_class, 'Test error')
    end
  end

  describe Kreuzberg::Errors::ValidationError do
    include_examples 'an Error subclass'
  end

  describe Kreuzberg::Errors::ParsingError do
    include_examples 'an Error subclass'

    it 'stores context' do
      parsing_error = described_class.new('Parsing failed', context: { file: 'test.pdf' })

      expect(parsing_error.context).to eq({ file: 'test.pdf' })
    end
  end

  describe Kreuzberg::Errors::OCRError do
    include_examples 'an Error subclass'

    it 'stores context' do
      ocr_error = described_class.new('OCR failed', context: { page: 1 })

      expect(ocr_error.context).to eq({ page: 1 })
    end
  end

  describe Kreuzberg::Errors::MissingDependencyError do
    include_examples 'an Error subclass'

    it 'stores dependency name' do
      missing = described_class.new('Tesseract not found', dependency: 'tesseract')

      expect(missing.dependency).to eq('tesseract')
    end
  end

  describe Kreuzberg::Errors::IOError do
    include_examples 'an Error subclass'
  end

  describe Kreuzberg::Errors::PluginError do
    include_examples 'an Error subclass'
  end
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Verifies the error class hierarchy under Kreuzberg::Errors and the extra
# attributes (context, dependency) some error classes expose.
RSpec.describe Kreuzberg::Errors do
  # Every concrete error class must descend from Kreuzberg::Errors::Error.
  shared_examples 'an Error subclass' do
    it 'is an Error subclass' do
      expect(described_class).to be < Kreuzberg::Errors::Error
    end
  end

  describe Kreuzberg::Errors::Error do
    it 'is a StandardError subclass' do
      expect(described_class).to be < StandardError
    end

    it 'can be raised with a message' do
      raising = -> { raise described_class, 'Test error' }

      expect(&raising).to raise_error(described_class, 'Test error')
    end
  end

  describe Kreuzberg::Errors::ValidationError do
    include_examples 'an Error subclass'
  end

  describe Kreuzberg::Errors::ParsingError do
    include_examples 'an Error subclass'

    it 'stores context' do
      parsing_error = described_class.new('Parsing failed', context: { file: 'test.pdf' })

      expect(parsing_error.context).to eq({ file: 'test.pdf' })
    end
  end

  describe Kreuzberg::Errors::OCRError do
    include_examples 'an Error subclass'

    it 'stores context' do
      ocr_error = described_class.new('OCR failed', context: { page: 1 })

      expect(ocr_error.context).to eq({ page: 1 })
    end
  end

  describe Kreuzberg::Errors::MissingDependencyError do
    include_examples 'an Error subclass'

    it 'stores dependency name' do
      missing = described_class.new('Tesseract not found', dependency: 'tesseract')

      expect(missing.dependency).to eq('tesseract')
    end
  end

  describe Kreuzberg::Errors::IOError do
    include_examples 'an Error subclass'
  end

  describe Kreuzberg::Errors::PluginError do
    include_examples 'an Error subclass'
  end
end
|