kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,977 @@
|
|
|
1
|
+
//! C FFI bindings for Kreuzberg document intelligence library.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides a C-compatible API that can be consumed by Java (Panama FFI),
|
|
4
|
+
//! Go (cgo), C# (P/Invoke), Zig, and other languages with C FFI support.
|
|
5
|
+
|
|
6
|
+
mod batch_streaming;
|
|
7
|
+
mod config;
|
|
8
|
+
mod error;
|
|
9
|
+
mod extraction;
|
|
10
|
+
mod helpers;
|
|
11
|
+
mod memory;
|
|
12
|
+
mod mime;
|
|
13
|
+
mod panic_shield;
|
|
14
|
+
mod plugins;
|
|
15
|
+
mod result;
|
|
16
|
+
mod result_pool;
|
|
17
|
+
mod result_view;
|
|
18
|
+
mod string_intern;
|
|
19
|
+
mod types;
|
|
20
|
+
mod util;
|
|
21
|
+
mod validation;
|
|
22
|
+
|
|
23
|
+
pub use batch_streaming::{
|
|
24
|
+
ErrorCallback, ResultCallback, kreuzberg_extract_batch_parallel, kreuzberg_extract_batch_streaming,
|
|
25
|
+
};
|
|
26
|
+
pub use config::{
|
|
27
|
+
kreuzberg_config_discover, kreuzberg_config_free, kreuzberg_config_from_file, kreuzberg_config_from_json,
|
|
28
|
+
kreuzberg_config_get_field, kreuzberg_config_is_valid, kreuzberg_config_merge, kreuzberg_config_to_json,
|
|
29
|
+
kreuzberg_get_embedding_preset, kreuzberg_list_embedding_presets, kreuzberg_load_extraction_config_from_file,
|
|
30
|
+
};
|
|
31
|
+
pub use error::ErrorCode as KreuzbergErrorCode;
|
|
32
|
+
pub use error::{
|
|
33
|
+
CErrorDetails, kreuzberg_classify_error, kreuzberg_error_code_count, kreuzberg_error_code_description,
|
|
34
|
+
kreuzberg_error_code_internal, kreuzberg_error_code_io, kreuzberg_error_code_missing_dependency,
|
|
35
|
+
kreuzberg_error_code_name, kreuzberg_error_code_ocr, kreuzberg_error_code_parsing, kreuzberg_error_code_plugin,
|
|
36
|
+
kreuzberg_error_code_unsupported_format, kreuzberg_error_code_validation, kreuzberg_get_error_details,
|
|
37
|
+
};
|
|
38
|
+
pub use extraction::{
|
|
39
|
+
kreuzberg_batch_extract_bytes_sync, kreuzberg_batch_extract_files_sync, kreuzberg_extract_bytes_sync,
|
|
40
|
+
kreuzberg_extract_bytes_sync_with_config, kreuzberg_extract_file_sync, kreuzberg_extract_file_sync_with_config,
|
|
41
|
+
};
|
|
42
|
+
pub use helpers::*;
|
|
43
|
+
pub use memory::{kreuzberg_clone_string, kreuzberg_free_batch_result, kreuzberg_free_result, kreuzberg_free_string};
|
|
44
|
+
pub use mime::{
|
|
45
|
+
kreuzberg_detect_mime_type, kreuzberg_detect_mime_type_from_bytes, kreuzberg_detect_mime_type_from_path,
|
|
46
|
+
kreuzberg_get_extensions_for_mime, kreuzberg_validate_mime_type,
|
|
47
|
+
};
|
|
48
|
+
pub use panic_shield::{
|
|
49
|
+
ErrorCode, StructuredError, clear_structured_error, get_last_error_code, get_last_error_message,
|
|
50
|
+
get_last_panic_context, set_structured_error,
|
|
51
|
+
};
|
|
52
|
+
pub use plugins::*;
|
|
53
|
+
pub use result::{
|
|
54
|
+
CMetadataField, kreuzberg_result_get_chunk_count, kreuzberg_result_get_detected_language,
|
|
55
|
+
kreuzberg_result_get_metadata_field, kreuzberg_result_get_page_count,
|
|
56
|
+
};
|
|
57
|
+
pub use result_pool::{
|
|
58
|
+
CResultPoolStats, ResultPool, kreuzberg_extract_file_into_pool, kreuzberg_extract_file_into_pool_view,
|
|
59
|
+
kreuzberg_result_pool_free, kreuzberg_result_pool_new, kreuzberg_result_pool_reset, kreuzberg_result_pool_stats,
|
|
60
|
+
};
|
|
61
|
+
pub use result_view::{
|
|
62
|
+
CExtractionResultView, kreuzberg_get_result_view, kreuzberg_view_get_content, kreuzberg_view_get_mime_type,
|
|
63
|
+
};
|
|
64
|
+
pub use string_intern::{
|
|
65
|
+
CStringInternStats, kreuzberg_free_interned_string, kreuzberg_intern_string, kreuzberg_string_intern_reset,
|
|
66
|
+
kreuzberg_string_intern_stats,
|
|
67
|
+
};
|
|
68
|
+
pub use types::*;
|
|
69
|
+
pub use util::{kreuzberg_last_error, kreuzberg_last_error_code, kreuzberg_last_panic_context, kreuzberg_version};
|
|
70
|
+
pub use validation::*;
|
|
71
|
+
|
|
72
|
+
#[cfg(test)]
|
|
73
|
+
mod tests {
|
|
74
|
+
use super::*;
|
|
75
|
+
use std::ffi::{CStr, CString};
|
|
76
|
+
use std::os::raw::c_char;
|
|
77
|
+
use std::ptr;
|
|
78
|
+
|
|
79
|
+
#[test]
|
|
80
|
+
fn test_version() {
|
|
81
|
+
unsafe {
|
|
82
|
+
let version = kreuzberg_version();
|
|
83
|
+
assert!(!version.is_null());
|
|
84
|
+
let version_str = CStr::from_ptr(version).to_str().unwrap();
|
|
85
|
+
assert!(!version_str.is_empty());
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
#[test]
|
|
90
|
+
fn test_null_path() {
|
|
91
|
+
unsafe {
|
|
92
|
+
let result = kreuzberg_extract_file_sync(ptr::null());
|
|
93
|
+
assert!(result.is_null());
|
|
94
|
+
|
|
95
|
+
let error = kreuzberg_last_error();
|
|
96
|
+
assert!(!error.is_null());
|
|
97
|
+
let error_str = CStr::from_ptr(error).to_str().unwrap();
|
|
98
|
+
assert!(error_str.contains("NULL"));
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
#[test]
|
|
103
|
+
fn test_nonexistent_file() {
|
|
104
|
+
unsafe {
|
|
105
|
+
let path = CString::new("/nonexistent/file.pdf").unwrap();
|
|
106
|
+
let result = kreuzberg_extract_file_sync(path.as_ptr());
|
|
107
|
+
assert!(result.is_null());
|
|
108
|
+
|
|
109
|
+
let error = kreuzberg_last_error();
|
|
110
|
+
assert!(!error.is_null());
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// ==================== Struct Layout Tests ====================
|
|
115
|
+
|
|
116
|
+
#[test]
|
|
117
|
+
fn test_cextraction_result_layout() {
|
|
118
|
+
// Test size
|
|
119
|
+
assert_eq!(
|
|
120
|
+
std::mem::size_of::<CExtractionResult>(),
|
|
121
|
+
104,
|
|
122
|
+
"CExtractionResult must be exactly 104 bytes"
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
// Test alignment
|
|
126
|
+
assert_eq!(
|
|
127
|
+
std::mem::align_of::<CExtractionResult>(),
|
|
128
|
+
8,
|
|
129
|
+
"CExtractionResult must be 8-byte aligned"
|
|
130
|
+
);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
#[test]
|
|
134
|
+
fn test_cbatch_result_layout() {
|
|
135
|
+
// Test size
|
|
136
|
+
assert_eq!(
|
|
137
|
+
std::mem::size_of::<CBatchResult>(),
|
|
138
|
+
24,
|
|
139
|
+
"CBatchResult must be exactly 24 bytes"
|
|
140
|
+
);
|
|
141
|
+
|
|
142
|
+
// Test alignment
|
|
143
|
+
assert_eq!(
|
|
144
|
+
std::mem::align_of::<CBatchResult>(),
|
|
145
|
+
8,
|
|
146
|
+
"CBatchResult must be 8-byte aligned"
|
|
147
|
+
);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
#[test]
|
|
151
|
+
fn test_cbytes_with_mime_layout() {
|
|
152
|
+
// Test size
|
|
153
|
+
assert_eq!(
|
|
154
|
+
std::mem::size_of::<CBytesWithMime>(),
|
|
155
|
+
24,
|
|
156
|
+
"CBytesWithMime must be exactly 24 bytes"
|
|
157
|
+
);
|
|
158
|
+
|
|
159
|
+
// Test alignment
|
|
160
|
+
assert_eq!(
|
|
161
|
+
std::mem::align_of::<CBytesWithMime>(),
|
|
162
|
+
8,
|
|
163
|
+
"CBytesWithMime must be 8-byte aligned"
|
|
164
|
+
);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// ==================== Memory Safety Tests ====================
|
|
168
|
+
|
|
169
|
+
/// Helper function to create a mock CExtractionResult for testing
|
|
170
|
+
fn create_mock_extraction_result() -> *mut CExtractionResult {
|
|
171
|
+
Box::into_raw(Box::new(CExtractionResult {
|
|
172
|
+
content: CString::new("test content").unwrap().into_raw(),
|
|
173
|
+
mime_type: CString::new("text/plain").unwrap().into_raw(),
|
|
174
|
+
language: CString::new("en").unwrap().into_raw(),
|
|
175
|
+
date: ptr::null_mut(),
|
|
176
|
+
subject: ptr::null_mut(),
|
|
177
|
+
tables_json: ptr::null_mut(),
|
|
178
|
+
detected_languages_json: ptr::null_mut(),
|
|
179
|
+
metadata_json: ptr::null_mut(),
|
|
180
|
+
chunks_json: ptr::null_mut(),
|
|
181
|
+
images_json: ptr::null_mut(),
|
|
182
|
+
page_structure_json: ptr::null_mut(),
|
|
183
|
+
pages_json: ptr::null_mut(),
|
|
184
|
+
success: true,
|
|
185
|
+
_padding1: [0u8; 7],
|
|
186
|
+
}))
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/// Builds a three-element `CBatchResult` by hand (Vec -> boxed slice -> raw pointer)
/// and releases it through `kreuzberg_free_batch_result`.
///
/// The original comments state that this mirrors the allocation pattern of
/// `kreuzberg_batch_extract_files_sync`; finishing without a crash is the pass
/// condition.
#[test]
fn test_batch_result_allocation_deallocation() {
    let entries: Box<[*mut CExtractionResult]> = vec![
        create_mock_extraction_result(),
        create_mock_extraction_result(),
        create_mock_extraction_result(),
    ]
    .into_boxed_slice();
    let count = entries.len();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(entries) as *mut *mut CExtractionResult,
        count,
        success: true,
        _padding2: [0u8; 7],
    }));

    unsafe {
        // Sanity-check the hand-built batch before handing it over.
        assert!(!batch.is_null());
        assert_eq!((*batch).count, 3);
        assert!((*batch).success);

        // Release through the public API.
        kreuzberg_free_batch_result(batch);
    }
}
|
|
223
|
+
|
|
224
|
+
/// Passing NULL to `kreuzberg_free_batch_result` must not crash.
#[test]
fn test_free_null_batch() {
    let no_batch: *mut CBatchResult = ptr::null_mut();
    unsafe { kreuzberg_free_batch_result(no_batch) };
}
|
|
231
|
+
|
|
232
|
+
/// Passing NULL to `kreuzberg_free_result` must not crash.
#[test]
fn test_free_null_result() {
    let no_result: *mut CExtractionResult = ptr::null_mut();
    unsafe { kreuzberg_free_result(no_result) };
}
|
|
239
|
+
|
|
240
|
+
/// Passing NULL to `kreuzberg_free_string` must not crash.
#[test]
fn test_free_null_string() {
    let no_string = ptr::null_mut();
    unsafe { kreuzberg_free_string(no_string) };
}
|
|
247
|
+
|
|
248
|
+
/// A batch whose results array has zero elements must be freed without trouble.
#[test]
fn test_batch_result_with_empty_results() {
    let no_entries: Box<[*mut CExtractionResult]> = Vec::new().into_boxed_slice();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(no_entries) as *mut *mut CExtractionResult,
        count: 0,
        success: true,
        _padding2: [0u8; 7],
    }));

    unsafe {
        assert!(!batch.is_null());
        assert_eq!((*batch).count, 0);

        // The free routine has nothing to walk over here.
        kreuzberg_free_batch_result(batch);
    }
}
|
|
270
|
+
|
|
271
|
+
/// A batch whose results array contains a NULL entry between two valid ones
/// must be accepted by `kreuzberg_free_batch_result`.
#[test]
fn test_batch_result_with_null_elements() {
    let entries: Box<[*mut CExtractionResult]> = vec![
        create_mock_extraction_result(),
        ptr::null_mut(), // deliberately missing entry
        create_mock_extraction_result(),
    ]
    .into_boxed_slice();
    let count = entries.len();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(entries) as *mut *mut CExtractionResult,
        count,
        success: true,
        _padding2: [0u8; 7],
    }));

    // Returning from this call without crashing is the pass condition.
    unsafe { kreuzberg_free_batch_result(batch) };
}
|
|
296
|
+
|
|
297
|
+
/// Boundary case: a batch holding exactly one result is built by hand and
/// released through `kreuzberg_free_batch_result`.
#[test]
fn test_batch_result_single_element() {
    let entries: Box<[*mut CExtractionResult]> =
        vec![create_mock_extraction_result()].into_boxed_slice();
    let count = entries.len();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(entries) as *mut *mut CExtractionResult,
        count,
        success: true,
        _padding2: [0u8; 7],
    }));

    unsafe {
        // The batch must look as constructed before it is released.
        assert!(!batch.is_null());
        assert_eq!((*batch).count, 1);
        assert!((*batch).success);

        kreuzberg_free_batch_result(batch);
    }
}
|
|
324
|
+
|
|
325
|
+
/// Builds a batch of 100 mock results by hand and releases it through
/// `kreuzberg_free_batch_result`, exercising a larger array than the other tests.
#[test]
fn test_batch_result_large_size() {
    let entries: Vec<*mut CExtractionResult> = (0..100)
        .map(|_| create_mock_extraction_result())
        .collect();
    let count = entries.len();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(entries.into_boxed_slice()) as *mut *mut CExtractionResult,
        count,
        success: true,
        _padding2: [0u8; 7],
    }));

    unsafe {
        // The batch must look as constructed before it is released.
        assert!(!batch.is_null());
        assert_eq!((*batch).count, 100);
        assert!((*batch).success);

        kreuzberg_free_batch_result(batch);
    }
}
|
|
356
|
+
|
|
357
|
+
/// Stress test: allocates a mock result and releases it with
/// `kreuzberg_free_result`, 1000 times in a row.
///
/// The test itself only fails on a crash or a failed assertion; spotting leaks
/// or use-after-free needs an external tool (sanitizer, Miri, valgrind).
#[test]
fn test_repeated_allocation_deallocation() {
    (0..1000).for_each(|_| {
        let mock = create_mock_extraction_result();

        unsafe {
            // The freshly built mock must be usable...
            assert!(!mock.is_null());
            assert!((*mock).success);

            // ...and is released straight away.
            kreuzberg_free_result(mock);
        }
    });
}
|
|
376
|
+
|
|
377
|
+
// ==================== Box/Vec Symmetry Test ====================
|
|
378
|
+
|
|
379
|
+
/// Checks that memory obtained via `Vec::into_boxed_slice` + `Box::into_raw`
/// can be read through the raw pointer and then released by rebuilding the
/// boxed slice with `Box::from_raw` (not `Vec::from_raw_parts`).
///
/// The Vec is created with spare capacity (5 slots, 3 used) before conversion.
#[test]
fn test_box_vec_symmetry() {
    let expected: [u32; 3] = [42, 100, 255];

    let mut values: Vec<u32> = Vec::with_capacity(5);
    values.extend_from_slice(&expected);
    let len = values.len();

    // Vec -> boxed slice -> thin raw pointer to the first element.
    let raw = Box::into_raw(values.into_boxed_slice()) as *mut u32;

    unsafe {
        // Every element must be readable at its offset.
        for (offset, want) in expected.iter().enumerate() {
            assert_eq!(*raw.add(offset), *want);
        }

        // Reassemble the boxed slice from pointer + length and let it drop.
        drop(Box::from_raw(std::ptr::slice_from_raw_parts_mut(raw, len)));
    }
}
|
|
408
|
+
|
|
409
|
+
/// Same round trip as `test_box_vec_symmetry`, but with the element type used
/// by `CBatchResult`: an array of `*mut CExtractionResult`.
///
/// Each non-NULL element is released with `kreuzberg_free_result` first; the
/// array itself is then released by rebuilding the boxed slice.
#[test]
fn test_box_vec_symmetry_pointers() {
    let entries: Box<[*mut CExtractionResult]> = vec![
        create_mock_extraction_result(),
        create_mock_extraction_result(),
        create_mock_extraction_result(),
    ]
    .into_boxed_slice();
    let len = entries.len();

    let raw = Box::into_raw(entries) as *mut *mut CExtractionResult;

    unsafe {
        // Step 1: release the individual results.
        (0..len)
            .map(|offset| *raw.add(offset))
            .filter(|entry| !entry.is_null())
            .for_each(|entry| kreuzberg_free_result(entry));

        // Step 2: release the pointer array.
        drop(Box::from_raw(std::ptr::slice_from_raw_parts_mut(raw, len)));
    }
}
|
|
439
|
+
|
|
440
|
+
// ==================== FFI Function Smoke Tests ====================
|
|
441
|
+
|
|
442
|
+
/// `kreuzberg_version` must return a non-NULL, non-empty UTF-8 string that
/// contains at least a dot or a numeric character.
#[test]
fn test_version_not_null() {
    unsafe {
        let raw = kreuzberg_version();
        assert!(!raw.is_null(), "Version string should not be NULL");

        let text = CStr::from_ptr(raw).to_str().unwrap();
        assert!(!text.is_empty(), "Version string should not be empty");

        // Loose plausibility check: any dot or numeric character will do.
        let looks_like_version = text.chars().any(|c| c == '.' || c.is_numeric());
        assert!(looks_like_version, "Version string should contain version info");
    }
}
|
|
458
|
+
|
|
459
|
+
/// Calls `kreuzberg_batch_extract_files_sync` with two paths and a NULL config.
///
/// The outcome is not asserted: the call may return NULL (the files may not
/// exist). A non-NULL batch is released; the test only checks for no crash.
#[test]
fn test_null_config_handling() {
    let first = CString::new("/tmp/test1.txt").unwrap();
    let second = CString::new("/tmp/test2.txt").unwrap();
    let paths = [first.as_ptr(), second.as_ptr()];

    unsafe {
        let batch = kreuzberg_batch_extract_files_sync(paths.as_ptr(), 2, ptr::null());

        // Nothing to release when the call failed.
        if batch.is_null() {
            return;
        }
        kreuzberg_free_batch_result(batch);
    }
}
|
|
479
|
+
|
|
480
|
+
/// `kreuzberg_free_result` must cope with a result in which only `content`
/// and `mime_type` are allocated and every other string field is NULL.
#[test]
fn test_extraction_result_free_with_null_fields() {
    // Turns a Rust string into an owned, NUL-terminated raw C string.
    let c_string = |text: &str| CString::new(text).unwrap().into_raw();

    let sparse = CExtractionResult {
        content: c_string("content"),
        mime_type: c_string("text/plain"),
        language: ptr::null_mut(),
        date: ptr::null_mut(),
        subject: ptr::null_mut(),
        tables_json: ptr::null_mut(),
        detected_languages_json: ptr::null_mut(),
        metadata_json: ptr::null_mut(),
        chunks_json: ptr::null_mut(),
        images_json: ptr::null_mut(),
        page_structure_json: ptr::null_mut(),
        pages_json: ptr::null_mut(),
        success: true,
        _padding1: [0u8; 7],
    };
    let raw = Box::into_raw(Box::new(sparse));

    // Returning without a crash is the pass condition.
    unsafe { kreuzberg_free_result(raw) };
}
|
|
505
|
+
|
|
506
|
+
/// `kreuzberg_free_result` must cope with a result in which all 12 string
/// fields carry an allocated C string.
#[test]
fn test_extraction_result_free_all_fields_allocated() {
    // Turns a Rust string into an owned, NUL-terminated raw C string.
    let c_string = |text: &str| CString::new(text).unwrap().into_raw();

    let full = CExtractionResult {
        content: c_string("test content"),
        mime_type: c_string("application/pdf"),
        language: c_string("en"),
        date: c_string("2024-01-01"),
        subject: c_string("Test Subject"),
        tables_json: c_string("[]"),
        detected_languages_json: c_string("[\"en\"]"),
        metadata_json: c_string("{}"),
        chunks_json: c_string("[{\"text\":\"chunk1\"}]"),
        images_json: c_string("[{\"data\":\"base64\"}]"),
        page_structure_json: c_string("{\"pages\":1}"),
        pages_json: c_string("[{\"page\":1,\"content\":\"test\"}]"),
        success: true,
        _padding1: [0u8; 7],
    };
    let raw = Box::into_raw(Box::new(full));

    // Every field above is owned by the result and handed over for release here.
    unsafe { kreuzberg_free_result(raw) };
}
|
|
532
|
+
|
|
533
|
+
/// Clones a C string with `kreuzberg_clone_string`, checks the copy matches,
/// and releases it with `kreuzberg_free_string`.
#[test]
fn test_string_allocation_deallocation() {
    let source = CString::new("test string").unwrap();

    unsafe {
        let copy = kreuzberg_clone_string(source.as_ptr());
        assert!(!copy.is_null(), "Cloned string should not be NULL");

        assert_eq!(
            CStr::from_ptr(copy).to_str().unwrap(),
            "test string",
            "Cloned string should match original"
        );

        // The copy is released by the library.
        kreuzberg_free_string(copy);
    }
}
|
|
549
|
+
|
|
550
|
+
/// Cloning NULL must return NULL and record an error whose text mentions "NULL".
#[test]
fn test_clone_null_string() {
    unsafe {
        // Start from a clean error state.
        clear_last_error();

        let copy = kreuzberg_clone_string(ptr::null());
        assert!(copy.is_null(), "Cloning NULL should return NULL");

        let raw_message = kreuzberg_last_error();
        assert!(!raw_message.is_null(), "Error should be set");

        let message = CStr::from_ptr(raw_message).to_str().unwrap();
        assert!(message.contains("NULL"), "Error should mention NULL");
    }
}
|
|
565
|
+
|
|
566
|
+
/// A hand-built, empty `CBatchResult` created with `success: true` must read
/// back as successful before it is released.
#[test]
fn test_batch_result_success_field() {
    let no_entries: Box<[*mut CExtractionResult]> = Vec::new().into_boxed_slice();

    let batch = Box::into_raw(Box::new(CBatchResult {
        results: Box::into_raw(no_entries) as *mut *mut CExtractionResult,
        count: 0,
        success: true,
        _padding2: [0u8; 7],
    }));

    unsafe {
        assert!((*batch).success, "Success field should be true");
        kreuzberg_free_batch_result(batch);
    }
}
|
|
586
|
+
|
|
587
|
+
/// After `set_last_error`, `kreuzberg_last_error` is non-NULL; after
/// `clear_last_error`, it is NULL again.
#[test]
fn test_last_error_cleared() {
    unsafe {
        // Record an error and confirm it is visible.
        set_last_error("test error".to_string());
        let before = kreuzberg_last_error();
        assert!(!before.is_null());

        // Clear it and confirm it is gone.
        clear_last_error();
        let after = kreuzberg_last_error();
        assert!(after.is_null(), "Error should be cleared");
    }
}
|
|
602
|
+
|
|
603
|
+
// ==================== Additional Safety Net Tests (PR #1) ====================
|
|
604
|
+
|
|
605
|
+
/// `CExtractionResult` must keep the size (104) and alignment (8) of the FFI contract.
#[test]
fn test_c_extraction_result_size() {
    use std::mem::{align_of, size_of};

    let size = size_of::<CExtractionResult>();
    let alignment = align_of::<CExtractionResult>();

    assert_eq!(size, 104);
    assert_eq!(alignment, 8);
}
|
|
611
|
+
|
|
612
|
+
/// `CBatchResult` must keep the size (24) and alignment (8) of the FFI contract.
#[test]
fn test_c_batch_result_size() {
    use std::mem::{align_of, size_of};

    let size = size_of::<CBatchResult>();
    let alignment = align_of::<CBatchResult>();

    assert_eq!(size, 24);
    assert_eq!(alignment, 8);
}
|
|
618
|
+
|
|
619
|
+
/// `CBytesWithMime` must keep the size (24) and alignment (8) of the FFI contract.
#[test]
fn test_c_bytes_with_mime_size() {
    use std::mem::{align_of, size_of};

    let size = size_of::<CBytesWithMime>();
    let alignment = align_of::<CBytesWithMime>();

    assert_eq!(size, 24);
    assert_eq!(alignment, 8);
}
|
|
625
|
+
|
|
626
|
+
/// `kreuzberg_extract_bytes_sync` must return NULL when the data pointer is NULL.
#[test]
fn test_extract_bytes_null_data() {
    let mime_type = CString::new("text/plain").unwrap();

    let extracted = unsafe { kreuzberg_extract_bytes_sync(ptr::null(), 0, mime_type.as_ptr()) };

    assert!(extracted.is_null(), "Should return NULL for NULL data pointer");
}
|
|
635
|
+
|
|
636
|
+
/// `kreuzberg_extract_bytes_sync` must return NULL when the MIME type pointer is NULL.
#[test]
fn test_extract_bytes_null_mime() {
    let payload: &[u8] = b"test data";

    let extracted =
        unsafe { kreuzberg_extract_bytes_sync(payload.as_ptr(), payload.len(), ptr::null()) };

    assert!(extracted.is_null(), "Should return NULL for NULL mime type");
}
|
|
645
|
+
|
|
646
|
+
/// `kreuzberg_batch_extract_files_sync` must return NULL when the paths pointer is NULL.
#[test]
fn test_batch_extract_null_paths() {
    let batch = unsafe { kreuzberg_batch_extract_files_sync(ptr::null(), 0, ptr::null()) };

    assert!(batch.is_null(), "Should return NULL for NULL paths pointer");
}
|
|
654
|
+
|
|
655
|
+
/// `kreuzberg_batch_extract_bytes_sync` must return NULL when the bytes pointer is NULL.
#[test]
fn test_batch_extract_bytes_null() {
    let batch = unsafe { kreuzberg_batch_extract_bytes_sync(ptr::null(), 0, ptr::null()) };

    assert!(batch.is_null(), "Should return NULL for NULL bytes pointer");
}
|
|
663
|
+
|
|
664
|
+
/// `kreuzberg_register_ocr_backend` must report failure (`false`) for a NULL name.
#[test]
fn test_register_ocr_backend_null_name() {
    // Placeholder callback: it only has to match the expected signature and
    // unconditionally returns NULL.
    extern "C" fn dummy_callback(_: *const u8, _: usize, _: *const c_char) -> *mut c_char {
        ptr::null_mut()
    }

    let registered = unsafe { kreuzberg_register_ocr_backend(ptr::null(), dummy_callback) };

    assert!(!registered, "Should return false for NULL backend name");
}
|
|
675
|
+
|
|
676
|
+
/// `kreuzberg_unregister_ocr_backend` must report failure (`false`) for a NULL name.
#[test]
fn test_unregister_ocr_backend_null_name() {
    let unregistered = unsafe { kreuzberg_unregister_ocr_backend(ptr::null()) };

    assert!(!unregistered, "Should return false for NULL backend name");
}
|
|
684
|
+
|
|
685
|
+
/// `kreuzberg_register_post_processor` must report failure (`false`) for a NULL name.
#[test]
fn test_register_post_processor_null_name() {
    // Placeholder callback: it only has to match the expected signature and
    // unconditionally returns NULL.
    extern "C" fn dummy_callback(_: *const c_char) -> *mut c_char {
        ptr::null_mut()
    }

    let registered = unsafe { kreuzberg_register_post_processor(ptr::null(), dummy_callback, 0) };

    assert!(!registered, "Should return false for NULL processor name");
}
|
|
696
|
+
|
|
697
|
+
/// `kreuzberg_unregister_post_processor` must report failure (`false`) for a NULL name.
#[test]
fn test_unregister_post_processor_null_name() {
    let unregistered = unsafe { kreuzberg_unregister_post_processor(ptr::null()) };

    assert!(!unregistered, "Should return false for NULL processor name");
}
|
|
705
|
+
|
|
706
|
+
/// `kreuzberg_register_validator` must report failure (`false`) for a NULL name.
#[test]
fn test_register_validator_null_name() {
    // Placeholder callback: it only has to match the expected signature and
    // unconditionally returns NULL.
    extern "C" fn dummy_callback(_: *const c_char) -> *mut c_char {
        ptr::null_mut()
    }

    let registered = unsafe { kreuzberg_register_validator(ptr::null(), dummy_callback, 0) };

    assert!(!registered, "Should return false for NULL validator name");
}
|
|
717
|
+
|
|
718
|
+
/// `kreuzberg_unregister_validator` must report failure (`false`) for a NULL name.
#[test]
fn test_unregister_validator_null_name() {
    let unregistered = unsafe { kreuzberg_unregister_validator(ptr::null()) };

    assert!(!unregistered, "Should return false for NULL validator name");
}
|
|
726
|
+
|
|
727
|
+
/// `kreuzberg_get_ocr_languages` must return NULL for a NULL backend name.
#[test]
fn test_get_ocr_languages_null_backend() {
    let languages = unsafe { kreuzberg_get_ocr_languages(ptr::null()) };

    assert!(languages.is_null(), "Should return NULL for NULL backend name");
}
|
|
735
|
+
|
|
736
|
+
/// `kreuzberg_is_language_supported` must return 0 when the backend pointer is NULL.
#[test]
fn test_is_language_supported_null_backend() {
    let language = CString::new("en").unwrap();

    let supported = unsafe { kreuzberg_is_language_supported(ptr::null(), language.as_ptr()) };

    assert_eq!(supported, 0, "Should return 0 (false) for NULL backend");
}
|
|
745
|
+
|
|
746
|
+
/// `kreuzberg_is_language_supported` must return 0 when the language pointer is NULL.
#[test]
fn test_is_language_supported_null_language() {
    let backend_name = CString::new("tesseract").unwrap();

    let supported = unsafe { kreuzberg_is_language_supported(backend_name.as_ptr(), ptr::null()) };

    assert_eq!(supported, 0, "Should return 0 (false) for NULL language");
}
|
|
755
|
+
|
|
756
|
+
/// `kreuzberg_validate_binarization_method` must return 0 for a NULL method.
#[test]
fn test_validate_binarization_method_null() {
    let verdict = unsafe { kreuzberg_validate_binarization_method(ptr::null()) };

    assert_eq!(verdict, 0, "Should return 0 (invalid) for NULL method");
}
|
|
764
|
+
|
|
765
|
+
/// `kreuzberg_validate_token_reduction_level` must return 0 for a NULL level.
#[test]
fn test_validate_token_reduction_level_null() {
    let verdict = unsafe { kreuzberg_validate_token_reduction_level(ptr::null()) };

    assert_eq!(verdict, 0, "Should return 0 (invalid) for NULL level");
}
|
|
773
|
+
|
|
774
|
+
/// `kreuzberg_validate_ocr_backend` must return 0 for a NULL backend.
#[test]
fn test_validate_ocr_backend_null() {
    let verdict = unsafe { kreuzberg_validate_ocr_backend(ptr::null()) };

    assert_eq!(verdict, 0, "Should return 0 (invalid) for NULL backend");
}
|
|
782
|
+
|
|
783
|
+
/// `kreuzberg_validate_language_code` must return 0 for a NULL language code.
#[test]
fn test_validate_language_code_null() {
    let verdict = unsafe { kreuzberg_validate_language_code(ptr::null()) };

    assert_eq!(verdict, 0, "Should return 0 (invalid) for NULL language code");
}
|
|
791
|
+
|
|
792
|
+
/// `kreuzberg_validate_output_format` must return 0 for a NULL format.
#[test]
fn test_validate_output_format_null() {
    let verdict = unsafe { kreuzberg_validate_output_format(ptr::null()) };

    assert_eq!(verdict, 0, "Should return 0 (invalid) for NULL format");
}
|
|
800
|
+
|
|
801
|
+
/// `kreuzberg_version` must hand back a non-NULL pointer to a non-empty UTF-8 string.
#[test]
fn test_version_returns_non_null() {
    unsafe {
        let raw = kreuzberg_version();
        assert!(!raw.is_null(), "kreuzberg_version should never return NULL");

        let text = CStr::from_ptr(raw).to_str().unwrap();
        assert!(!text.is_empty(), "Version string should not be empty");
    }
}
|
|
811
|
+
|
|
812
|
+
/// Right after `clear_last_error`, `kreuzberg_last_error` must return NULL.
#[test]
fn test_last_error_null_when_no_error() {
    unsafe {
        clear_last_error();

        let pending = kreuzberg_last_error();
        assert!(pending.is_null(), "Should return NULL when no error is set");
    }
}
|
|
821
|
+
|
|
822
|
+
/// `kreuzberg_clone_string` must return a non-NULL copy for a valid input string;
/// the copy is released with `kreuzberg_free_string`.
#[test]
fn test_clone_string_returns_non_null() {
    let source = CString::new("test").unwrap();

    unsafe {
        let copy = kreuzberg_clone_string(source.as_ptr());
        assert!(!copy.is_null(), "Clone should return non-NULL for valid input");

        kreuzberg_free_string(copy);
    }
}
|
|
832
|
+
|
|
833
|
+
/// Calling `kreuzberg_clear_ocr_backends` twice in a row must not crash.
#[test]
fn test_clear_ocr_backends_doesnt_crash() {
    unsafe {
        // First call, then an immediate repeat.
        kreuzberg_clear_ocr_backends();
        kreuzberg_clear_ocr_backends();
    }
}
|
|
842
|
+
|
|
843
|
+
/// Calling `kreuzberg_clear_post_processors` twice in a row must not crash.
#[test]
fn test_clear_post_processors_doesnt_crash() {
    unsafe {
        // First call, then an immediate repeat.
        kreuzberg_clear_post_processors();
        kreuzberg_clear_post_processors();
    }
}
|
|
852
|
+
|
|
853
|
+
/// Calling `kreuzberg_clear_validators` twice in a row must not crash.
#[test]
fn test_clear_validators_doesnt_crash() {
    unsafe {
        // First call, then an immediate repeat.
        kreuzberg_clear_validators();
        kreuzberg_clear_validators();
    }
}
|
|
862
|
+
|
|
863
|
+
/// Calling `kreuzberg_clear_document_extractors` twice in a row must not crash.
#[test]
fn test_clear_document_extractors_doesnt_crash() {
    unsafe {
        // First call, then an immediate repeat.
        kreuzberg_clear_document_extractors();
        kreuzberg_clear_document_extractors();
    }
}
|
|
872
|
+
|
|
873
|
+
/// Every `kreuzberg_list_*` function must return a non-NULL string.
///
/// Each returned string is released with `kreuzberg_free_string` before the
/// next listing is requested.
#[test]
fn test_list_functions_return_non_null() {
    unsafe {
        // OCR backends.
        let ocr_listing = kreuzberg_list_ocr_backends();
        assert!(!ocr_listing.is_null(), "list_ocr_backends should return non-NULL");
        kreuzberg_free_string(ocr_listing);

        // Post processors.
        let processor_listing = kreuzberg_list_post_processors();
        assert!(!processor_listing.is_null(), "list_post_processors should return non-NULL");
        kreuzberg_free_string(processor_listing);

        // Validators.
        let validator_listing = kreuzberg_list_validators();
        assert!(!validator_listing.is_null(), "list_validators should return non-NULL");
        kreuzberg_free_string(validator_listing);

        // Document extractors.
        let extractor_listing = kreuzberg_list_document_extractors();
        assert!(!extractor_listing.is_null(), "list_document_extractors should return non-NULL");
        kreuzberg_free_string(extractor_listing);

        // OCR backends together with their languages.
        let language_listing = kreuzberg_list_ocr_backends_with_languages();
        assert!(!language_listing.is_null(), "list_ocr_backends_with_languages should return non-NULL");
        kreuzberg_free_string(language_listing);
    }
}
|
|
902
|
+
|
|
903
|
+
/// Exercises the numeric validators at and around their boundaries.
///
/// Each table row is `(input..., expected return value, failure message)`;
/// a return value of 1 means the validator accepted the input and 0 that it
/// rejected it.
#[test]
fn test_numeric_validation_edge_cases() {
    // Tesseract PSM.
    for &(psm, expected, why) in &[
        (-1, 0, "Negative PSM should be invalid"),
        (0, 1, "PSM 0 should be valid"),
        (13, 1, "PSM 13 should be valid"),
        (14, 0, "PSM 14 should be invalid"),
    ] {
        assert_eq!(kreuzberg_validate_tesseract_psm(psm), expected, "{}", why);
    }

    // Tesseract OEM.
    for &(oem, expected, why) in &[
        (-1, 0, "Negative OEM should be invalid"),
        (0, 1, "OEM 0 should be valid"),
        (3, 1, "OEM 3 should be valid"),
        (4, 0, "OEM 4 should be invalid"),
    ] {
        assert_eq!(kreuzberg_validate_tesseract_oem(oem), expected, "{}", why);
    }

    // Confidence.
    for &(confidence, expected, why) in &[
        (-0.1, 0, "Negative confidence should be invalid"),
        (0.0, 1, "0.0 confidence should be valid"),
        (0.5, 1, "0.5 confidence should be valid"),
        (1.0, 1, "1.0 confidence should be valid"),
        (1.1, 0, "1.1 confidence should be invalid"),
    ] {
        assert_eq!(kreuzberg_validate_confidence(confidence), expected, "{}", why);
    }

    // DPI.
    for &(dpi, expected, why) in &[
        (0, 0, "0 DPI should be invalid"),
        (-1, 0, "-1 DPI should be invalid"),
        (1, 1, "1 DPI should be valid"),
        (72, 1, "72 DPI should be valid"),
        (300, 1, "300 DPI should be valid"),
        (2400, 1, "2400 DPI should be valid"),
        (2401, 0, "2401 DPI should be invalid"),
    ] {
        assert_eq!(kreuzberg_validate_dpi(dpi), expected, "{}", why);
    }

    // Chunking parameters, passed in the same order as the original calls.
    for &(first, second, expected, why) in &[
        (0, 0, 0, "0 max_chars should be invalid"),
        (100, 0, 1, "Valid params should pass"),
        (100, 50, 1, "Valid overlap should pass"),
        (100, 100, 0, "Overlap >= max_chars should be invalid"),
        (100, 101, 0, "Overlap > max_chars should be invalid"),
    ] {
        assert_eq!(
            kreuzberg_validate_chunking_params(first, second),
            expected,
            "{}",
            why
        );
    }
}
|
|
977
|
+
}
|