kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
use std::ffi::{CStr, CString};
|
|
2
|
+
use std::os::raw::c_char;
|
|
3
|
+
use std::ptr;
|
|
4
|
+
|
|
5
|
+
/// Function-pointer type accepted by `kreuzberg_register_document_extractor`.
///
/// The test callbacks in this file name the four arguments `content`,
/// `content_len`, `mime_type` and `config_json`; the returned pointer is a
/// C string (or null, as `failing_extractor_callback` demonstrates).
type DocumentExtractorCallback =
    unsafe extern "C" fn(*const u8, usize, *const c_char, *const c_char) -> *mut c_char;

// Foreign declarations for the document-extractor entry points exercised by
// the tests in this file. The symbols are resolved at link time.
unsafe extern "C" {
    /// Registers `callback` under `name` for the comma-separated `mime_types`
    /// with the given `priority`. The tests treat `true` as success.
    fn kreuzberg_register_document_extractor(
        name: *const c_char,
        callback: DocumentExtractorCallback,
        mime_types: *const c_char,
        priority: i32,
    ) -> bool;

    /// Removes the extractor registered under `name`.
    fn kreuzberg_unregister_document_extractor(name: *const c_char) -> bool;

    /// Returns a C string describing the registered extractors; the tests
    /// release it with `kreuzberg_free_string`.
    fn kreuzberg_list_document_extractors() -> *mut c_char;

    /// Returns the most recent error message as a C string. The tests only
    /// read it and never free it.
    fn kreuzberg_last_error() -> *const c_char;

    /// Releases a string previously handed out by the library.
    fn kreuzberg_free_string(s: *mut c_char);
}
|
|
21
|
+
|
|
22
|
+
/// Extractor callback used by the registration tests.
///
/// Ignores all of its inputs and returns a freshly allocated C string holding
/// a fixed JSON document. Ownership of the allocation passes to the caller
/// via `CString::into_raw`.
unsafe extern "C" fn test_extractor_callback(
    _content: *const u8,
    _content_len: usize,
    _mime_type: *const c_char,
    _config_json: *const c_char,
) -> *mut c_char {
    const PAYLOAD: &str = r#"{
        "content": "test extracted content",
        "mime_type": "text/plain",
        "metadata": {}
    }"#;

    // The literal contains no interior NUL byte, so the conversion cannot fail.
    let owned = CString::new(PAYLOAD).unwrap();
    owned.into_raw()
}
|
|
35
|
+
|
|
36
|
+
/// Extractor callback that always reports failure by returning a null pointer.
// Not referenced by any test in the visible part of this file, hence the allow.
#[allow(dead_code)]
unsafe extern "C" fn failing_extractor_callback(
    _content: *const u8,
    _content_len: usize,
    _mime_type: *const c_char,
    _config_json: *const c_char,
) -> *mut c_char {
    let no_result: *mut c_char = ptr::null_mut();
    no_result
}
|
|
45
|
+
|
|
46
|
+
/// Registering an extractor with a valid name and MIME list must succeed.
#[test]
fn test_register_document_extractor_success() {
    let extractor_name = CString::new("test-extractor").unwrap();
    let supported_mimes = CString::new("application/x-test,text/x-test").unwrap();

    // SAFETY: both pointers come from `CString`s that outlive the call.
    // Presumably that is all the FFI requires of its string arguments.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            extractor_name.as_ptr(),
            test_extractor_callback,
            supported_mimes.as_ptr(),
            100,
        )
    };
    assert!(registered, "Failed to register extractor");

    // Clean up so the registration does not outlive this test.
    // SAFETY: `extractor_name` is still alive and NUL-terminated.
    unsafe {
        kreuzberg_unregister_document_extractor(extractor_name.as_ptr());
    }
}
|
|
60
|
+
|
|
61
|
+
/// A null extractor name must be rejected and reported via the last-error string.
#[test]
fn test_register_document_extractor_null_name() {
    let supported_mimes = CString::new("application/x-test").unwrap();

    // SAFETY: the null name is deliberate; this test expects the library to
    // reject it. `supported_mimes` is a live `CString`.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            ptr::null(),
            test_extractor_callback,
            supported_mimes.as_ptr(),
            100,
        )
    };
    assert!(!registered, "Should fail with NULL name");

    // SAFETY: no arguments are passed.
    let last_error = unsafe { kreuzberg_last_error() };
    assert!(!last_error.is_null());

    // SAFETY: checked non-null above; assumes the library returns a
    // NUL-terminated string that stays valid while we read it (TODO confirm).
    let message = unsafe { CStr::from_ptr(last_error) }.to_str().unwrap();
    assert!(message.contains("NULL"), "Error should mention NULL: {}", message);
}
|
|
77
|
+
|
|
78
|
+
/// A null MIME-type list must be rejected with an error mentioning MIME or NULL.
#[test]
fn test_register_document_extractor_null_mime_types() {
    let extractor_name = CString::new("test-extractor").unwrap();

    // SAFETY: the null MIME list is deliberate; this test expects the library
    // to reject it. `extractor_name` is a live `CString`.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            extractor_name.as_ptr(),
            test_extractor_callback,
            ptr::null(),
            100,
        )
    };
    assert!(!registered, "Should fail with NULL MIME types");

    // SAFETY: no arguments are passed.
    let last_error = unsafe { kreuzberg_last_error() };
    assert!(!last_error.is_null());

    // SAFETY: checked non-null above; assumes a valid NUL-terminated string
    // (TODO confirm against the library's contract).
    let message = unsafe { CStr::from_ptr(last_error) }.to_str().unwrap();
    let mentions_mime = message.contains("MIME");
    assert!(mentions_mime || message.contains("NULL"));
}
|
|
93
|
+
|
|
94
|
+
/// An empty MIME-type list must be rejected with an error mentioning MIME.
#[test]
fn test_register_document_extractor_empty_mime_types() {
    let extractor_name = CString::new("test-extractor").unwrap();
    let no_mimes = CString::new("").unwrap();

    // SAFETY: both pointers come from `CString`s that outlive the call.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            extractor_name.as_ptr(),
            test_extractor_callback,
            no_mimes.as_ptr(),
            100,
        )
    };
    assert!(!registered, "Should fail with empty MIME types");

    // SAFETY: no arguments are passed.
    let last_error = unsafe { kreuzberg_last_error() };
    assert!(!last_error.is_null());

    // SAFETY: checked non-null above; assumes a valid NUL-terminated string
    // (TODO confirm against the library's contract).
    let message = unsafe { CStr::from_ptr(last_error) }.to_str().unwrap();
    assert!(message.contains("MIME"));
}
|
|
111
|
+
|
|
112
|
+
/// An extractor that was registered successfully can be unregistered again.
#[test]
fn test_unregister_document_extractor_success() {
    let extractor_name = CString::new("test-extractor-unregister").unwrap();
    let supported_mimes = CString::new("application/x-test").unwrap();

    // SAFETY: both pointers come from `CString`s that outlive the call.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            extractor_name.as_ptr(),
            test_extractor_callback,
            supported_mimes.as_ptr(),
            100,
        )
    };
    assert!(registered);

    // SAFETY: `extractor_name` is still alive and NUL-terminated.
    let removed = unsafe { kreuzberg_unregister_document_extractor(extractor_name.as_ptr()) };
    assert!(removed, "Failed to unregister extractor");
}
|
|
126
|
+
|
|
127
|
+
/// Unregistering with a null name must fail and report an error mentioning NULL.
#[test]
fn test_unregister_document_extractor_null_name() {
    // SAFETY: the null name is deliberate; this test expects the library to
    // reject it.
    let removed = unsafe { kreuzberg_unregister_document_extractor(ptr::null()) };
    assert!(!removed, "Should fail with NULL name");

    // SAFETY: no arguments are passed.
    let last_error = unsafe { kreuzberg_last_error() };
    assert!(!last_error.is_null());

    // SAFETY: checked non-null above; assumes a valid NUL-terminated string
    // (TODO confirm against the library's contract).
    let message = unsafe { CStr::from_ptr(last_error) }.to_str().unwrap();
    assert!(message.contains("NULL"));
}
|
|
139
|
+
|
|
140
|
+
/// Unregistering a name that was never registered is expected to report success.
#[test]
fn test_unregister_nonexistent_extractor() {
    let unknown_name = CString::new("nonexistent-extractor").unwrap();

    // SAFETY: `unknown_name` is a live, NUL-terminated `CString`.
    let removed = unsafe { kreuzberg_unregister_document_extractor(unknown_name.as_ptr()) };
    assert!(removed, "Unregistering nonexistent extractor should succeed");
}
|
|
149
|
+
|
|
150
|
+
/// Registers two extractors and checks that both names appear in the listing.
///
/// All FFI work, including freeing the listing and unregistering both
/// extractors, happens before any assertion runs. A failing assertion
/// therefore cannot leak the returned string or leave the two extractors
/// registered for tests that run later in the same process.
#[test]
fn test_list_document_extractors() {
    let name1 = CString::new("test-extractor-1").unwrap();
    let name2 = CString::new("test-extractor-2").unwrap();
    let mime_types = CString::new("application/x-test").unwrap();

    // SAFETY: every string pointer comes from a `CString` that outlives the
    // block. `list_ptr` is only read after a null check and is released with
    // the library's own `kreuzberg_free_string`.
    let (registered1, registered2, listing) = unsafe {
        let registered1 = kreuzberg_register_document_extractor(
            name1.as_ptr(),
            test_extractor_callback,
            mime_types.as_ptr(),
            100,
        );
        let registered2 = kreuzberg_register_document_extractor(
            name2.as_ptr(),
            test_extractor_callback,
            mime_types.as_ptr(),
            100,
        );

        let list_ptr = kreuzberg_list_document_extractors();
        let listing = if list_ptr.is_null() {
            None
        } else {
            // Copy into an owned `String` so the foreign buffer can be freed
            // right away, before anything below has a chance to panic.
            let copy = CStr::from_ptr(list_ptr).to_string_lossy().into_owned();
            kreuzberg_free_string(list_ptr);
            Some(copy)
        };

        // Unregistering a name that is not registered reports success (see
        // `test_unregister_nonexistent_extractor`), so this cleanup is safe
        // even if one of the registrations failed.
        kreuzberg_unregister_document_extractor(name1.as_ptr());
        kreuzberg_unregister_document_extractor(name2.as_ptr());

        (registered1, registered2, listing)
    };

    assert!(registered1, "Failed to register test-extractor-1");
    assert!(registered2, "Failed to register test-extractor-2");

    let list_str = listing.expect("List should not be NULL");
    assert!(list_str.contains("test-extractor-1"));
    assert!(list_str.contains("test-extractor-2"));
}
|
|
173
|
+
|
|
174
|
+
/// A comma-separated MIME list with stray spaces around the separators must be accepted.
#[test]
fn test_register_multiple_mime_types() {
    let extractor_name = CString::new("multi-mime-extractor").unwrap();
    let supported_mimes = CString::new("application/x-test1, text/x-test2 , image/x-test3").unwrap();

    // SAFETY: both pointers come from `CString`s that outlive the call.
    let registered = unsafe {
        kreuzberg_register_document_extractor(
            extractor_name.as_ptr(),
            test_extractor_callback,
            supported_mimes.as_ptr(),
            100,
        )
    };
    assert!(registered, "Failed to register with multiple MIME types");

    // Clean up so the registration does not outlive this test.
    // SAFETY: `extractor_name` is still alive and NUL-terminated.
    unsafe {
        kreuzberg_unregister_document_extractor(extractor_name.as_ptr());
    }
}
|
|
188
|
+
|
|
189
|
+
/// Registers two extractors for the same MIME type with priorities 200 and 50.
#[test]
fn test_register_with_different_priorities() {
    let high = CString::new("high-priority-extractor").unwrap();
    let low = CString::new("low-priority-extractor").unwrap();
    let mime = CString::new("application/x-test").unwrap();

    // SAFETY: all name and MIME pointers come from `CString`s that live until the end of the test.
    unsafe {
        // Both registrations run before either result is inspected.
        let high_ok =
            kreuzberg_register_document_extractor(high.as_ptr(), test_extractor_callback, mime.as_ptr(), 200);
        let low_ok =
            kreuzberg_register_document_extractor(low.as_ptr(), test_extractor_callback, mime.as_ptr(), 50);

        assert!(high_ok && low_ok, "Failed to register extractors");

        kreuzberg_unregister_document_extractor(high.as_ptr());
        kreuzberg_unregister_document_extractor(low.as_ptr());
    }
}
|
|
211
|
+
|
|
212
|
+
/// An extractor name that is not valid UTF-8 must be rejected with a UTF-8 related error.
#[test]
fn test_invalid_utf8_name() {
    // 0xFF and 0xFE never occur in well-formed UTF-8; the trailing 0 terminates the C string.
    let malformed: &[u8] = b"test\xFF\xFEinvalid\0";
    let mime = CString::new("application/x-test").unwrap();

    // SAFETY: `malformed` is NUL-terminated and `mime` is a live `CString`; the error pointer
    // is null-checked before it is read.
    unsafe {
        let registered = kreuzberg_register_document_extractor(
            malformed.as_ptr().cast::<c_char>(),
            test_extractor_callback,
            mime.as_ptr(),
            100,
        );
        assert!(!registered, "Should fail with invalid UTF-8 name");

        let last_error = kreuzberg_last_error();
        assert!(!last_error.is_null());

        let message = CStr::from_ptr(last_error).to_str().unwrap();
        assert!(message.contains("UTF-8"));
    }
}
|
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
//! FFI plugin registration integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests the FFI layer for registering and managing validators, and for listing and unregistering OCR backends.
|
|
4
|
+
|
|
5
|
+
use std::ffi::{CStr, CString};
|
|
6
|
+
use std::os::raw::c_char;
|
|
7
|
+
use std::ptr;
|
|
8
|
+
|
|
9
|
+
/// Signature of the callback passed to `kreuzberg_register_validator`.
///
/// All four arguments are C string pointers. The mock callbacks in this file return a null
/// pointer for "validation passed" and a heap-allocated message for "validation failed".
type ValidatorCallback = unsafe extern "C" fn(
    content: *const c_char,
    mime_type: *const c_char,
    metadata_json: *const c_char,
    config_json: *const c_char,
) -> *mut c_char;

// FFI symbols exercised by the tests below; they are expected to be resolved at link time.
unsafe extern "C" {
    // Validator registry.
    fn kreuzberg_register_validator(name: *const c_char, callback: ValidatorCallback, priority: i32) -> bool;
    fn kreuzberg_unregister_validator(name: *const c_char) -> bool;
    fn kreuzberg_list_validators() -> *mut c_char;
    fn kreuzberg_clear_validators() -> bool;

    // OCR backend registry.
    fn kreuzberg_unregister_ocr_backend(name: *const c_char) -> bool;
    fn kreuzberg_list_ocr_backends() -> *mut c_char;

    // Shared helpers: the tests release listing strings with `kreuzberg_free_string` and
    // read failure details through `kreuzberg_last_error`.
    fn kreuzberg_free_string(s: *mut c_char);
    fn kreuzberg_last_error() -> *const c_char;
}
|
|
29
|
+
|
|
30
|
+
/// Copies a C string into an owned `String`, returning `None` for a null pointer.
///
/// Bytes that are not valid UTF-8 are replaced rather than rejected (lossy conversion).
///
/// # Safety
///
/// A non-null `ptr` must point to a valid NUL-terminated string.
unsafe fn c_str_to_string(ptr: *const c_char) -> Option<String> {
    if ptr.is_null() {
        return None;
    }

    // SAFETY: `ptr` is non-null here, and the caller guarantees it is NUL-terminated.
    let text = unsafe { CStr::from_ptr(ptr) };
    Some(text.to_string_lossy().into_owned())
}
|
|
38
|
+
|
|
39
|
+
/// Helper to get last error message
|
|
40
|
+
unsafe fn get_last_error() -> Option<String> {
|
|
41
|
+
let error_ptr = unsafe { kreuzberg_last_error() };
|
|
42
|
+
unsafe { c_str_to_string(error_ptr) }
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/// Mock validator callback that accepts every document.
///
/// All arguments are ignored; the function always returns a null pointer.
unsafe extern "C" fn passing_validator_callback(
    _content: *const c_char,
    _mime_type: *const c_char,
    _metadata_json: *const c_char,
    _config_json: *const c_char,
) -> *mut c_char {
    // Null is this mock's "no error message" result.
    ptr::null_mut::<c_char>()
}
|
|
54
|
+
|
|
55
|
+
/// Mock validator callback that rejects every document.
///
/// All arguments are ignored; the function always returns a freshly allocated C string
/// holding a fixed error message.
unsafe extern "C" fn failing_validator_callback(
    _content: *const c_char,
    _mime_type: *const c_char,
    _metadata_json: *const c_char,
    _config_json: *const c_char,
) -> *mut c_char {
    // `into_raw` releases the allocation from Rust's ownership, so whoever receives the
    // pointer becomes responsible for it.
    CString::new("Validation failed: content too short")
        .unwrap()
        .into_raw()
}
|
|
65
|
+
|
|
66
|
+
/// Registers a single validator and checks that its name shows up in the validator listing.
#[test]
fn test_register_validator_succeeds() {
    let validator_name = CString::new("test-validator").unwrap();

    // SAFETY: `validator_name` outlives every call; the listing pointer is null-checked,
    // copied into an owned `String`, and then released with `kreuzberg_free_string`.
    unsafe {
        // Start from an empty registry.
        kreuzberg_clear_validators();

        let registered = kreuzberg_register_validator(validator_name.as_ptr(), passing_validator_callback, 50);
        assert!(registered, "Validator registration should succeed");

        let listing = {
            let raw = kreuzberg_list_validators();
            assert!(!raw.is_null(), "List should not be null");
            let owned = c_str_to_string(raw).expect("Should have valid JSON");
            kreuzberg_free_string(raw);
            owned
        };

        assert!(
            listing.contains("test-validator"),
            "List should contain registered validator"
        );

        kreuzberg_clear_validators();
    }
}
|
|
91
|
+
|
|
92
|
+
/// Registers three validators with different callbacks and priorities, then checks that all
/// three names appear in the listing.
#[test]
fn test_register_multiple_validators_succeeds() {
    let labels = ["validator-1", "validator-2", "validator-3"];
    // Keep the C strings alive for the whole test.
    let names: Vec<CString> = labels.iter().map(|label| CString::new(*label).unwrap()).collect();

    // One row per registration: callback, priority, message shown if registration fails.
    let plan = [
        (
            passing_validator_callback as ValidatorCallback,
            100,
            "First validator registration should succeed",
        ),
        (
            passing_validator_callback as ValidatorCallback,
            50,
            "Second validator registration should succeed",
        ),
        (
            failing_validator_callback as ValidatorCallback,
            25,
            "Third validator registration should succeed",
        ),
    ];

    // SAFETY: all name pointers come from `names`, which outlives every FFI call; the listing
    // pointer is copied into an owned `String` before being released.
    unsafe {
        kreuzberg_clear_validators();

        for (name, (callback, priority, failure)) in names.iter().zip(plan) {
            assert!(
                kreuzberg_register_validator(name.as_ptr(), callback, priority),
                "{}",
                failure
            );
        }

        let raw_listing = kreuzberg_list_validators();
        let listing = c_str_to_string(raw_listing).expect("Should have valid JSON");
        kreuzberg_free_string(raw_listing);

        for label in labels {
            assert!(listing.contains(label), "Should contain {}", label);
        }

        kreuzberg_clear_validators();
    }
}
|
|
126
|
+
|
|
127
|
+
/// Test unregistering validator.
///
/// Registers a validator, unregisters it by name, and checks that the name is no longer
/// present in the validator listing.
#[test]
fn test_unregister_validator_succeeds() {
    // SAFETY: `name` is a live `CString` for every call; the listing pointer is copied into
    // an owned `String` before being released with `kreuzberg_free_string`.
    unsafe {
        kreuzberg_clear_validators();

        let name = CString::new("temp-validator").unwrap();
        // Check the registration itself. Unregistering an unknown name is also reported as
        // success, so without this assertion the test would pass even if nothing had been
        // registered.
        assert!(
            kreuzberg_register_validator(name.as_ptr(), passing_validator_callback, 50),
            "Validator registration should succeed"
        );

        let result = kreuzberg_unregister_validator(name.as_ptr());
        assert!(result, "Unregistration should succeed");

        let list_ptr = kreuzberg_list_validators();
        let list_json = c_str_to_string(list_ptr).expect("Should have valid JSON");
        kreuzberg_free_string(list_ptr);

        assert!(
            !list_json.contains("temp-validator"),
            "List should not contain unregistered validator"
        );

        kreuzberg_clear_validators();
    }
}
|
|
151
|
+
|
|
152
|
+
/// Unregistering a validator that was never registered is a no-op.
///
/// Despite the test name, the call is asserted to return `true`.
#[test]
fn test_unregister_nonexistent_validator_fails_gracefully() {
    let unknown = CString::new("nonexistent-validator").unwrap();

    // SAFETY: `unknown` is a valid NUL-terminated string that outlives the call.
    let removed = unsafe {
        kreuzberg_clear_validators();
        kreuzberg_unregister_validator(unknown.as_ptr())
    };

    assert!(removed, "Unregistering non-existent validator should succeed (no-op)");

    // SAFETY: takes no arguments; resets the registry for the next test.
    unsafe {
        kreuzberg_clear_validators();
    }
}
|
|
166
|
+
|
|
167
|
+
/// A NULL validator name must be rejected, and the last error must explain why.
#[test]
fn test_register_validator_with_null_name_fails_gracefully() {
    // SAFETY: passing NULL is the condition under test.
    let registered = unsafe { kreuzberg_register_validator(ptr::null(), passing_validator_callback, 50) };
    assert!(!registered, "Registration with null name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let Some(reason) = (unsafe { get_last_error() }) else {
        panic!("Should have error message");
    };

    let names_the_cause = ["null", "invalid", "empty"]
        .iter()
        .any(|needle| reason.contains(*needle));
    assert!(names_the_cause, "Error should mention null/invalid: {}", reason);
}
|
|
185
|
+
|
|
186
|
+
/// An empty validator name must be rejected, and the last error must explain why.
#[test]
fn test_register_validator_with_empty_name_fails_gracefully() {
    let blank = CString::new("").unwrap();

    // SAFETY: `blank` is a valid (empty) NUL-terminated string that outlives the call.
    let registered = unsafe { kreuzberg_register_validator(blank.as_ptr(), passing_validator_callback, 50) };
    assert!(!registered, "Registration with empty name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let reason = match unsafe { get_last_error() } {
        Some(text) => text,
        None => panic!("Should have error message"),
    };

    let names_the_cause = reason.contains("empty") || reason.contains("invalid");
    assert!(names_the_cause, "Error should mention empty/invalid: {}", reason);
}
|
|
205
|
+
|
|
206
|
+
/// A validator name containing spaces must be rejected, and the last error must explain why.
#[test]
fn test_register_validator_with_whitespace_in_name_fails_gracefully() {
    let spaced = CString::new("validator with spaces").unwrap();

    // SAFETY: `spaced` is a valid NUL-terminated string that outlives the call.
    let registered = unsafe { kreuzberg_register_validator(spaced.as_ptr(), passing_validator_callback, 50) };
    assert!(!registered, "Registration with whitespace in name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let Some(reason) = (unsafe { get_last_error() }) else {
        panic!("Should have error message");
    };

    let names_the_cause = ["whitespace", "invalid"]
        .iter()
        .any(|needle| reason.contains(*needle));
    assert!(names_the_cause, "Error should mention whitespace/invalid: {}", reason);
}
|
|
225
|
+
|
|
226
|
+
/// Test registering validator with invalid UTF-8 fails gracefully.
///
/// The name is a NUL-terminated byte sequence containing 0xFF 0xFE, which is not valid
/// UTF-8. Registration must return `false` and the last error must mention "Invalid UTF-8".
#[test]
fn test_register_validator_with_invalid_utf8_fails_gracefully() {
    // SAFETY: `invalid_bytes` is NUL-terminated and lives until the end of the block.
    unsafe {
        // Fixed contents, so a stack array is enough; no heap `Vec` is needed.
        let invalid_bytes = [
            b'v', b'a', b'l', b'i', b'd', b'a', b't', b'o', b'r', b'-', 0xFF, 0xFE, 0x00,
        ];
        // Cast to `c_char`, not `i8`: `c_char` is `u8` on some targets, where a `*const i8`
        // argument would not type-check against the FFI signature.
        let name_ptr = invalid_bytes.as_ptr() as *const c_char;
        let result = kreuzberg_register_validator(name_ptr, passing_validator_callback, 50);

        assert!(!result, "Should fail with invalid UTF-8");
        let error = get_last_error();
        assert!(error.is_some(), "Should have error message on failure");
        assert!(
            error.unwrap().contains("Invalid UTF-8"),
            "Error should mention UTF-8 issue"
        );

        kreuzberg_clear_validators();
    }
}
|
|
247
|
+
|
|
248
|
+
/// Test clearing all validators.
///
/// Registers two validators, clears the registry, and checks that the listing parses as an
/// empty JSON array.
#[test]
fn test_clear_validators_succeeds() {
    // SAFETY: `v1` and `v2` are live `CString`s for every call; the listing pointer is copied
    // into an owned `String` before being released with `kreuzberg_free_string`.
    unsafe {
        kreuzberg_clear_validators();

        let v1 = CString::new("validator-1").unwrap();
        let v2 = CString::new("validator-2").unwrap();
        // Both registrations must succeed; clearing an already-empty registry proves nothing.
        assert!(
            kreuzberg_register_validator(v1.as_ptr(), passing_validator_callback, 50),
            "First validator registration should succeed"
        );
        assert!(
            kreuzberg_register_validator(v2.as_ptr(), passing_validator_callback, 50),
            "Second validator registration should succeed"
        );

        let result = kreuzberg_clear_validators();
        assert!(result, "Clear should succeed");

        let list_ptr = kreuzberg_list_validators();
        let list_json = c_str_to_string(list_ptr).expect("Should have valid JSON");
        kreuzberg_free_string(list_ptr);

        // Parse strictly: `unwrap_or_default()` would turn malformed JSON into an empty list
        // and let the test pass on garbage output.
        let validators: Vec<String> = serde_json::from_str(&list_json).expect("Should be valid JSON array");
        assert!(validators.is_empty(), "List should be empty after clear");
    }
}
|
|
270
|
+
|
|
271
|
+
/// The validator listing must parse as a JSON array of strings and include a registered name.
#[test]
fn test_list_validators_returns_valid_json() {
    let validator_name = CString::new("test-validator").unwrap();

    // SAFETY: `validator_name` outlives every call; the listing pointer is null-checked,
    // copied into an owned `String`, and then released.
    let listing = unsafe {
        kreuzberg_clear_validators();
        kreuzberg_register_validator(validator_name.as_ptr(), passing_validator_callback, 50);

        let raw = kreuzberg_list_validators();
        assert!(!raw.is_null(), "List should not be null");

        let owned = c_str_to_string(raw).expect("Should have valid JSON");
        kreuzberg_free_string(raw);
        owned
    };

    let parsed: Vec<String> = serde_json::from_str(&listing).expect("Should be valid JSON array");
    let is_listed = parsed.contains(&"test-validator".to_string());
    assert!(is_listed, "Should contain registered validator");

    // SAFETY: takes no arguments; resets the registry for the next test.
    unsafe {
        kreuzberg_clear_validators();
    }
}
|
|
295
|
+
|
|
296
|
+
/// With no validators registered, the listing must parse as an empty JSON array.
#[test]
fn test_list_empty_validators_returns_empty_array() {
    // SAFETY: the listing pointer is null-checked, copied into an owned `String`, and then
    // released with `kreuzberg_free_string`.
    let listing = unsafe {
        kreuzberg_clear_validators();

        let raw = kreuzberg_list_validators();
        assert!(!raw.is_null(), "List should not be null");

        let owned = c_str_to_string(raw).expect("Should have valid JSON");
        kreuzberg_free_string(raw);
        owned
    };

    let parsed: Vec<String> = serde_json::from_str(&listing).expect("Should be valid JSON array");
    assert_eq!(parsed.len(), 0, "Should be empty array");
}
|
|
312
|
+
|
|
313
|
+
/// Test registering duplicate validator replaces previous one.
///
/// Registers the same name twice with different callbacks and priorities, then checks that
/// the listing contains that name exactly once.
#[test]
fn test_register_duplicate_validator_replaces_previous() {
    // SAFETY: `name` is a live `CString` for every call; the listing pointer is copied into
    // an owned `String` before being released with `kreuzberg_free_string`.
    unsafe {
        kreuzberg_clear_validators();

        let name = CString::new("duplicate-validator").unwrap();

        // The first registration must succeed; otherwise the second call is not a duplicate
        // and the replace path is never exercised.
        assert!(
            kreuzberg_register_validator(name.as_ptr(), passing_validator_callback, 50),
            "Initial registration should succeed"
        );

        let result = kreuzberg_register_validator(name.as_ptr(), failing_validator_callback, 100);

        assert!(result, "Duplicate registration should succeed (replace)");

        let list_ptr = kreuzberg_list_validators();
        let list_json = c_str_to_string(list_ptr).expect("Should have valid JSON");
        kreuzberg_free_string(list_ptr);

        let validators: Vec<String> = serde_json::from_str(&list_json).unwrap();
        let duplicate_count = validators.iter().filter(|v| *v == "duplicate-validator").count();
        assert_eq!(duplicate_count, 1, "Should only have one instance of the validator");

        kreuzberg_clear_validators();
    }
}
|
|
338
|
+
|
|
339
|
+
/// Validators registered with different priorities (10 and 100) must both appear in the listing.
///
/// Only presence is checked; the order of the listing is not inspected.
#[test]
fn test_validator_priorities_are_registered() {
    let low = CString::new("low-priority-validator").unwrap();
    let high = CString::new("high-priority-validator").unwrap();

    // SAFETY: both names outlive every call; the listing pointer is copied into an owned
    // `String` before being released with `kreuzberg_free_string`.
    let listing = unsafe {
        kreuzberg_clear_validators();

        kreuzberg_register_validator(low.as_ptr(), passing_validator_callback, 10);
        kreuzberg_register_validator(high.as_ptr(), passing_validator_callback, 100);

        let raw = kreuzberg_list_validators();
        let owned = c_str_to_string(raw).expect("Should have valid JSON");
        kreuzberg_free_string(raw);
        owned
    };

    let has_low = listing.contains("low-priority-validator");
    assert!(has_low, "Should contain low priority validator");

    let has_high = listing.contains("high-priority-validator");
    assert!(has_high, "Should contain high priority validator");

    // SAFETY: takes no arguments; resets the registry for the next test.
    unsafe {
        kreuzberg_clear_validators();
    }
}
|
|
367
|
+
|
|
368
|
+
/// Test listing OCR backends returns valid JSON.
///
/// The listing must be non-null and must parse as a JSON array of strings. The test makes no
/// assumption about which backends are registered.
#[test]
fn test_list_ocr_backends_returns_valid_json() {
    // SAFETY: the listing pointer is null-checked, copied into an owned `String`, and then
    // released with `kreuzberg_free_string`.
    unsafe {
        let list_ptr = kreuzberg_list_ocr_backends();
        assert!(!list_ptr.is_null(), "List should not be null");

        let list_json = c_str_to_string(list_ptr).expect("Should have valid JSON");
        kreuzberg_free_string(list_ptr);

        // A successful parse is the whole check. The previous
        // `is_empty() || !is_empty()` assertion was always true and verified nothing.
        let _backends: Vec<String> = serde_json::from_str(&list_json).expect("Should be valid JSON array");
    }
}
|
|
383
|
+
|
|
384
|
+
/// Unregistering an OCR backend that was never registered is a no-op reported as success.
#[test]
fn test_unregister_nonexistent_ocr_backend_succeeds_gracefully() {
    let unknown = CString::new("nonexistent-ocr-backend").unwrap();

    // SAFETY: `unknown` is a valid NUL-terminated string that outlives the call.
    let removed = unsafe { kreuzberg_unregister_ocr_backend(unknown.as_ptr()) };

    assert!(removed, "Unregistering non-existent OCR backend should succeed (no-op)");
}
|
|
394
|
+
|
|
395
|
+
/// A NULL OCR backend name must be rejected, and the last error must mention the null name.
#[test]
fn test_unregister_ocr_backend_with_null_name_fails_gracefully() {
    // SAFETY: passing NULL is the condition under test.
    let removed = unsafe { kreuzberg_unregister_ocr_backend(ptr::null()) };
    assert!(!removed, "Unregistration with null name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let reason = match unsafe { get_last_error() } {
        Some(text) => text,
        None => panic!("Should have error message"),
    };

    // Either capitalisation of "null" is accepted.
    let mentions_null = ["NULL", "null"].iter().any(|needle| reason.contains(*needle));
    assert!(mentions_null, "Error should mention null: {}", reason);
}
|
|
413
|
+
|
|
414
|
+
/// An empty OCR backend name must be rejected, and the last error must explain why.
#[test]
fn test_unregister_ocr_backend_with_empty_name_fails_gracefully() {
    let blank = CString::new("").unwrap();

    // SAFETY: `blank` is a valid (empty) NUL-terminated string that outlives the call.
    let removed = unsafe { kreuzberg_unregister_ocr_backend(blank.as_ptr()) };
    assert!(!removed, "Unregistration with empty name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let Some(reason) = (unsafe { get_last_error() }) else {
        panic!("Should have error message");
    };

    let names_the_cause = reason.contains("empty") || reason.contains("invalid");
    assert!(names_the_cause, "Error should mention empty/invalid: {}", reason);
}
|
|
433
|
+
|
|
434
|
+
/// An OCR backend name containing spaces must be rejected, and the last error must explain why.
#[test]
fn test_unregister_ocr_backend_with_whitespace_in_name_fails_gracefully() {
    let spaced = CString::new("ocr backend with spaces").unwrap();

    // SAFETY: `spaced` is a valid NUL-terminated string that outlives the call.
    let removed = unsafe { kreuzberg_unregister_ocr_backend(spaced.as_ptr()) };
    assert!(!removed, "Unregistration with whitespace in name should fail");

    // SAFETY: `get_last_error` tolerates a null error pointer.
    let reason = match unsafe { get_last_error() } {
        Some(text) => text,
        None => panic!("Should have error message"),
    };

    let names_the_cause = ["whitespace", "invalid"]
        .iter()
        .any(|needle| reason.contains(*needle));
    assert!(names_the_cause, "Error should mention whitespace/invalid: {}", reason);
}
|
|
453
|
+
|
|
454
|
+
/// Test unregistering OCR backend with invalid UTF-8 fails gracefully.
///
/// The name is a NUL-terminated byte sequence containing 0xFF 0xFE, which is not valid
/// UTF-8. The call must return `false` and the last error must mention "Invalid UTF-8".
#[test]
fn test_unregister_ocr_backend_with_invalid_utf8_fails_gracefully() {
    // SAFETY: `invalid_bytes` is NUL-terminated and lives until the end of the block.
    unsafe {
        let invalid_bytes = [b'o', b'c', b'r', b'-', 0xFF, 0xFE, 0x00];
        // Cast to `c_char`, not `i8`: `c_char` is `u8` on some targets, where a `*const i8`
        // argument would not type-check against the FFI signature.
        let name_ptr = invalid_bytes.as_ptr() as *const c_char;
        let result = kreuzberg_unregister_ocr_backend(name_ptr);

        assert!(!result, "Should fail with invalid UTF-8");
        let error = get_last_error();
        assert!(error.is_some(), "Should have error message on failure");
        assert!(
            error.unwrap().contains("Invalid UTF-8"),
            "Error should mention UTF-8 issue"
        );
    }
}
|