kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,588 @@
|
|
|
1
|
+
//! Batch result streaming FFI module.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides callback-based batch processing to avoid holding all results in memory.
|
|
4
|
+
//! Processes files one at a time, calling a user-provided callback for each result.
|
|
5
|
+
//!
|
|
6
|
+
//! # Benefits
|
|
7
|
+
//!
|
|
8
|
+
//! - 30-50% memory reduction for large batches (no accumulation of results)
|
|
9
|
+
//! - Early error detection (fail-fast on first error or continue processing)
|
|
10
|
+
//! - Progress reporting and cancellation support
|
|
11
|
+
//! - Optional parallel processing with rayon
|
|
12
|
+
//!
|
|
13
|
+
//! # Safety Model
|
|
14
|
+
//!
|
|
15
|
+
//! - Callback receives borrowed result pointer valid only during the callback
|
|
16
|
+
//! - Caller must copy/serialize data before callback returns if persistence needed
|
|
17
|
+
//! - Results are automatically freed after callback returns
|
|
18
|
+
//! - Error callback is optional and independent of result processing
|
|
19
|
+
//!
|
|
20
|
+
//! # Example (C)
|
|
21
|
+
//!
|
|
22
|
+
//! ```c
|
|
23
|
+
//! int process_result(const CExtractionResultView* result, size_t index, void* user_data) {
|
|
24
|
+
//! // Process or copy result data here
|
|
25
|
+
//! printf("Processing file %zu\n", index);
|
|
26
|
+
//! return 0; // Continue processing
|
|
27
|
+
//! }
|
|
28
|
+
//!
|
|
29
|
+
//! void handle_error(size_t index, const char* error_msg, void* user_data) {
|
|
30
|
+
//! fprintf(stderr, "Error processing file %zu: %s\n", index, error_msg);
|
|
31
|
+
//! }
|
|
32
|
+
//!
|
|
33
|
+
//! const char* files[] = {"doc1.pdf", "doc2.txt", "doc3.docx"};
|
|
34
|
+
//! int result = kreuzberg_extract_batch_streaming(
|
|
35
|
+
//! files, 3, NULL, process_result, NULL, handle_error
|
|
36
|
+
//! );
|
|
37
|
+
//! ```
|
|
38
|
+
|
|
39
|
+
use crate::result_view::{CExtractionResultView, create_result_view};
|
|
40
|
+
use crate::{FfiResult, clear_last_error, parse_extraction_config_from_json, set_last_error};
|
|
41
|
+
use kreuzberg::types::ExtractionResult;
|
|
42
|
+
use std::ffi::{CStr, CString};
|
|
43
|
+
use std::os::raw::{c_char, c_int, c_void};
|
|
44
|
+
use std::path::Path;
|
|
45
|
+
#[cfg(feature = "rayon")]
|
|
46
|
+
use std::sync::Arc;
|
|
47
|
+
#[cfg(feature = "rayon")]
|
|
48
|
+
use std::sync::atomic::{AtomicBool, Ordering};
|
|
49
|
+
|
|
50
|
+
/// Callback function invoked for each successfully extracted result.
|
|
51
|
+
///
|
|
52
|
+
/// # Arguments
|
|
53
|
+
///
|
|
54
|
+
/// * `result` - Borrowed pointer to extraction result (valid only during callback)
|
|
55
|
+
/// * `file_index` - Zero-based index of the file in the batch
|
|
56
|
+
/// * `user_data` - User-provided context pointer
|
|
57
|
+
///
|
|
58
|
+
/// # Returns
|
|
59
|
+
///
|
|
60
|
+
/// - `0` to continue processing remaining files
|
|
61
|
+
/// - Non-zero to cancel batch processing (no further callbacks)
|
|
62
|
+
///
|
|
63
|
+
/// # Safety
|
|
64
|
+
///
|
|
65
|
+
/// - `result` pointer is valid only during the callback execution
|
|
66
|
+
/// - `result` is automatically freed after callback returns
|
|
67
|
+
/// - Caller must copy/serialize data if needed beyond callback scope
|
|
68
|
+
/// - `user_data` is passed through opaquely (caller manages lifetime)
|
|
69
|
+
pub type ResultCallback =
|
|
70
|
+
unsafe extern "C" fn(result: *const CExtractionResultView, file_index: usize, user_data: *mut c_void) -> c_int;
|
|
71
|
+
|
|
72
|
+
/// Callback function invoked when a file extraction fails.
|
|
73
|
+
///
|
|
74
|
+
/// # Arguments
|
|
75
|
+
///
|
|
76
|
+
/// * `file_index` - Zero-based index of the file that failed
|
|
77
|
+
/// * `error_msg` - Null-terminated UTF-8 error message (valid only during callback)
|
|
78
|
+
/// * `user_data` - User-provided context pointer
|
|
79
|
+
///
|
|
80
|
+
/// # Safety
|
|
81
|
+
///
|
|
82
|
+
/// - `error_msg` is valid only during callback execution
|
|
83
|
+
/// - Caller must copy string if needed beyond callback scope
|
|
84
|
+
/// - `user_data` is passed through opaquely (caller manages lifetime)
|
|
85
|
+
pub type ErrorCallback = unsafe extern "C" fn(file_index: usize, error_msg: *const c_char, user_data: *mut c_void);
|
|
86
|
+
|
|
87
|
+
/// Extract multiple files in streaming mode with callback-based result delivery.
|
|
88
|
+
///
|
|
89
|
+
/// Processes files one at a time without accumulating results in memory.
|
|
90
|
+
/// Each result is passed to the callback and then freed automatically.
|
|
91
|
+
///
|
|
92
|
+
/// # Arguments
|
|
93
|
+
///
|
|
94
|
+
/// * `files` - Array of null-terminated file path strings
|
|
95
|
+
/// * `count` - Number of files in the array
|
|
96
|
+
/// * `config_json` - Optional JSON configuration string (NULL for defaults)
|
|
97
|
+
/// * `result_callback` - Callback invoked for each successful extraction
|
|
98
|
+
/// * `user_data` - Optional user context passed to callbacks
|
|
99
|
+
/// * `error_callback` - Optional callback invoked for extraction failures
|
|
100
|
+
///
|
|
101
|
+
/// # Returns
|
|
102
|
+
///
|
|
103
|
+
/// - `0` on success (all files processed or cancelled by callback)
|
|
104
|
+
/// - `-1` on error (invalid arguments, configuration parsing failure)
|
|
105
|
+
///
|
|
106
|
+
/// # Error Handling
|
|
107
|
+
///
|
|
108
|
+
/// - Individual file failures invoke `error_callback` but don't stop processing
|
|
109
|
+
/// - Callback can return non-zero to cancel remaining files
|
|
110
|
+
/// - Invalid arguments or config parsing errors return `-1` immediately
|
|
111
|
+
///
|
|
112
|
+
/// # Safety
|
|
113
|
+
///
|
|
114
|
+
/// - `files` must point to valid array of `count` C string pointers
|
|
115
|
+
/// - All file path strings must be valid null-terminated UTF-8
|
|
116
|
+
/// - `config_json` must be valid null-terminated UTF-8 if not NULL
|
|
117
|
+
/// - `result_callback` must be a valid function pointer
|
|
118
|
+
/// - `error_callback` must be a valid function pointer if not NULL
|
|
119
|
+
/// - Result pointers passed to callbacks are valid only during callback
|
|
120
|
+
/// - Callbacks must not store result pointers for later use
|
|
121
|
+
///
|
|
122
|
+
/// # Example (C)
|
|
123
|
+
///
|
|
124
|
+
/// ```c
|
|
125
|
+
/// int process_result(const CExtractionResultView* result, size_t index, void* data) {
|
|
126
|
+
/// // Copy data needed beyond callback scope
|
|
127
|
+
/// char content[1024];
|
|
128
|
+
/// size_t copy_len = result->content_len < 1024 ? result->content_len : 1023;
|
|
129
|
+
/// memcpy(content, result->content_ptr, copy_len);
|
|
130
|
+
/// content[copy_len] = '\0';
|
|
131
|
+
/// return 0; // Continue
|
|
132
|
+
/// }
|
|
133
|
+
///
|
|
134
|
+
/// void handle_error(size_t index, const char* msg, void* data) {
|
|
135
|
+
/// fprintf(stderr, "File %zu failed: %s\n", index, msg);
|
|
136
|
+
/// }
|
|
137
|
+
///
|
|
138
|
+
/// const char* files[] = {"a.pdf", "b.txt", "c.docx"};
|
|
139
|
+
/// kreuzberg_extract_batch_streaming(files, 3, NULL, process_result, NULL, handle_error);
|
|
140
|
+
/// ```
|
|
141
|
+
#[unsafe(no_mangle)]
|
|
142
|
+
pub unsafe extern "C" fn kreuzberg_extract_batch_streaming(
|
|
143
|
+
files: *const *const c_char,
|
|
144
|
+
count: usize,
|
|
145
|
+
config_json: *const c_char,
|
|
146
|
+
result_callback: ResultCallback,
|
|
147
|
+
user_data: *mut c_void,
|
|
148
|
+
error_callback: Option<ErrorCallback>,
|
|
149
|
+
) -> c_int {
|
|
150
|
+
clear_last_error();
|
|
151
|
+
|
|
152
|
+
if files.is_null() {
|
|
153
|
+
set_last_error("Files array cannot be NULL".to_string());
|
|
154
|
+
return -1;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
if count == 0 {
|
|
158
|
+
return 0;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
let config = if !config_json.is_null() {
|
|
162
|
+
match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
163
|
+
Ok(config_str) => match parse_extraction_config_from_json(config_str) {
|
|
164
|
+
Ok(cfg) => cfg,
|
|
165
|
+
Err(e) => {
|
|
166
|
+
set_last_error(format!("Invalid configuration: {}", e));
|
|
167
|
+
return -1;
|
|
168
|
+
}
|
|
169
|
+
},
|
|
170
|
+
Err(e) => {
|
|
171
|
+
set_last_error(format!("Invalid UTF-8 in config: {}", e));
|
|
172
|
+
return -1;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
} else {
|
|
176
|
+
Default::default()
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
for i in 0..count {
|
|
180
|
+
let file_ptr = unsafe { *files.add(i) };
|
|
181
|
+
|
|
182
|
+
if file_ptr.is_null() {
|
|
183
|
+
if let Some(err_cb) = error_callback
|
|
184
|
+
&& let Ok(err_msg) = CString::new("File path is NULL")
|
|
185
|
+
{
|
|
186
|
+
unsafe { err_cb(i, err_msg.as_ptr(), user_data) };
|
|
187
|
+
}
|
|
188
|
+
continue;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
let file_path = match unsafe { CStr::from_ptr(file_ptr) }.to_str() {
|
|
192
|
+
Ok(s) => s,
|
|
193
|
+
Err(e) => {
|
|
194
|
+
if let Some(err_cb) = error_callback
|
|
195
|
+
&& let Ok(err_msg) = CString::new(format!("Invalid UTF-8 in file path: {}", e))
|
|
196
|
+
{
|
|
197
|
+
unsafe { err_cb(i, err_msg.as_ptr(), user_data) };
|
|
198
|
+
}
|
|
199
|
+
continue;
|
|
200
|
+
}
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
match extract_file_internal(file_path, &config) {
|
|
204
|
+
Ok(result) => {
|
|
205
|
+
let view = create_result_view(&result);
|
|
206
|
+
|
|
207
|
+
let continue_processing = unsafe { result_callback(&view as *const _, i, user_data) };
|
|
208
|
+
|
|
209
|
+
if continue_processing != 0 {
|
|
210
|
+
return 0;
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
Err(e) => {
|
|
214
|
+
if let Some(err_cb) = error_callback
|
|
215
|
+
&& let Ok(err_msg) = CString::new(e)
|
|
216
|
+
{
|
|
217
|
+
unsafe { err_cb(i, err_msg.as_ptr(), user_data) };
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
0
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/// Extract multiple files in parallel streaming mode.
|
|
227
|
+
///
|
|
228
|
+
/// Similar to `kreuzberg_extract_batch_streaming` but processes files in parallel
|
|
229
|
+
/// using a thread pool. Results are delivered via callback as they complete.
|
|
230
|
+
///
|
|
231
|
+
/// # Arguments
|
|
232
|
+
///
|
|
233
|
+
/// * `files` - Array of null-terminated file path strings
|
|
234
|
+
/// * `count` - Number of files in the array
|
|
235
|
+
/// * `config_json` - Optional JSON configuration string (NULL for defaults)
|
|
236
|
+
/// * `result_callback` - Thread-safe callback invoked for each successful extraction
|
|
237
|
+
/// * `user_data` - Optional user context passed to callbacks (must be thread-safe)
|
|
238
|
+
/// * `error_callback` - Optional thread-safe callback invoked for failures
|
|
239
|
+
/// * `max_parallel` - Maximum number of parallel extractions (0 = number of CPUs)
|
|
240
|
+
///
|
|
241
|
+
/// # Returns
|
|
242
|
+
///
|
|
243
|
+
/// - `0` on success (all files processed or cancelled)
|
|
244
|
+
/// - `-1` on error (invalid arguments, configuration parsing failure)
|
|
245
|
+
///
|
|
246
|
+
/// # Thread Safety
|
|
247
|
+
///
|
|
248
|
+
/// - Both callbacks may be invoked concurrently from multiple threads
|
|
249
|
+
/// - `user_data` must be thread-safe (e.g., synchronized with mutex)
|
|
250
|
+
/// - Callback can set atomic flag to signal cancellation
|
|
251
|
+
///
|
|
252
|
+
/// # Safety
|
|
253
|
+
///
|
|
254
|
+
/// Same requirements as `kreuzberg_extract_batch_streaming`, plus:
|
|
255
|
+
/// - Callbacks must be thread-safe
|
|
256
|
+
/// - `user_data` must support concurrent access
|
|
257
|
+
///
|
|
258
|
+
/// # Example (C)
|
|
259
|
+
///
|
|
260
|
+
/// ```c
|
|
261
|
+
/// typedef struct {
|
|
262
|
+
/// pthread_mutex_t lock;
|
|
263
|
+
/// atomic_int cancel_flag;
|
|
264
|
+
/// } BatchContext;
|
|
265
|
+
///
|
|
266
|
+
/// int process_result(const CExtractionResultView* result, size_t index, void* data) {
|
|
267
|
+
/// BatchContext* ctx = (BatchContext*)data;
|
|
268
|
+
/// pthread_mutex_lock(&ctx->lock);
|
|
269
|
+
/// // Process result with thread safety
|
|
270
|
+
/// pthread_mutex_unlock(&ctx->lock);
|
|
271
|
+
/// return atomic_load(&ctx->cancel_flag);
|
|
272
|
+
/// }
|
|
273
|
+
/// ```
|
|
274
|
+
#[unsafe(no_mangle)]
|
|
275
|
+
#[cfg_attr(not(feature = "rayon"), allow(unused_variables))]
|
|
276
|
+
pub unsafe extern "C" fn kreuzberg_extract_batch_parallel(
|
|
277
|
+
files: *const *const c_char,
|
|
278
|
+
count: usize,
|
|
279
|
+
config_json: *const c_char,
|
|
280
|
+
result_callback: ResultCallback,
|
|
281
|
+
user_data: *mut c_void,
|
|
282
|
+
error_callback: Option<ErrorCallback>,
|
|
283
|
+
max_parallel: usize,
|
|
284
|
+
) -> c_int {
|
|
285
|
+
clear_last_error();
|
|
286
|
+
|
|
287
|
+
if files.is_null() {
|
|
288
|
+
set_last_error("Files array cannot be NULL".to_string());
|
|
289
|
+
return -1;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
if count == 0 {
|
|
293
|
+
return 0;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
let config = if !config_json.is_null() {
|
|
297
|
+
match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
298
|
+
Ok(config_str) => match parse_extraction_config_from_json(config_str) {
|
|
299
|
+
Ok(cfg) => cfg,
|
|
300
|
+
Err(e) => {
|
|
301
|
+
set_last_error(format!("Invalid configuration: {}", e));
|
|
302
|
+
return -1;
|
|
303
|
+
}
|
|
304
|
+
},
|
|
305
|
+
Err(e) => {
|
|
306
|
+
set_last_error(format!("Invalid UTF-8 in config: {}", e));
|
|
307
|
+
return -1;
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
} else {
|
|
311
|
+
Default::default()
|
|
312
|
+
};
|
|
313
|
+
|
|
314
|
+
let mut file_paths = Vec::with_capacity(count);
|
|
315
|
+
for i in 0..count {
|
|
316
|
+
let file_ptr = unsafe { *files.add(i) };
|
|
317
|
+
|
|
318
|
+
if file_ptr.is_null() {
|
|
319
|
+
if let Some(err_cb) = error_callback
|
|
320
|
+
&& let Ok(err_msg) = CString::new("File path is NULL")
|
|
321
|
+
{
|
|
322
|
+
unsafe { err_cb(i, err_msg.as_ptr(), user_data) };
|
|
323
|
+
}
|
|
324
|
+
continue;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
match unsafe { CStr::from_ptr(file_ptr) }.to_str() {
|
|
328
|
+
Ok(s) => file_paths.push((i, s.to_string())),
|
|
329
|
+
Err(e) => {
|
|
330
|
+
if let Some(err_cb) = error_callback
|
|
331
|
+
&& let Ok(err_msg) = CString::new(format!("Invalid UTF-8: {}", e))
|
|
332
|
+
{
|
|
333
|
+
unsafe { err_cb(i, err_msg.as_ptr(), user_data) };
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
#[cfg(feature = "rayon")]
|
|
340
|
+
{
|
|
341
|
+
use rayon::prelude::*;
|
|
342
|
+
|
|
343
|
+
let cancelled = Arc::new(AtomicBool::new(false));
|
|
344
|
+
let config = Arc::new(config);
|
|
345
|
+
|
|
346
|
+
let pool = if max_parallel > 0 {
|
|
347
|
+
rayon::ThreadPoolBuilder::new().num_threads(max_parallel).build()
|
|
348
|
+
} else {
|
|
349
|
+
rayon::ThreadPoolBuilder::new().build()
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
let pool = match pool {
|
|
353
|
+
Ok(p) => p,
|
|
354
|
+
Err(e) => {
|
|
355
|
+
set_last_error(format!("Failed to create thread pool: {}", e));
|
|
356
|
+
return -1;
|
|
357
|
+
}
|
|
358
|
+
};
|
|
359
|
+
|
|
360
|
+
let user_data_ptr = user_data as usize;
|
|
361
|
+
|
|
362
|
+
pool.install(|| {
|
|
363
|
+
file_paths.par_iter().for_each(|(index, path)| {
|
|
364
|
+
if cancelled.load(Ordering::Relaxed) {
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
match extract_file_internal(path, &config) {
|
|
369
|
+
Ok(result) => {
|
|
370
|
+
let view = create_result_view(&result);
|
|
371
|
+
|
|
372
|
+
let should_cancel =
|
|
373
|
+
unsafe { result_callback(&view as *const _, *index, user_data_ptr as *mut c_void) };
|
|
374
|
+
|
|
375
|
+
if should_cancel != 0 {
|
|
376
|
+
cancelled.store(true, Ordering::Relaxed);
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
Err(e) => {
|
|
380
|
+
if let Some(err_cb) = error_callback
|
|
381
|
+
&& let Ok(err_msg) = CString::new(e)
|
|
382
|
+
{
|
|
383
|
+
unsafe { err_cb(*index, err_msg.as_ptr(), user_data_ptr as *mut c_void) };
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
});
|
|
388
|
+
});
|
|
389
|
+
|
|
390
|
+
0
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
#[cfg(not(feature = "rayon"))]
|
|
394
|
+
{
|
|
395
|
+
set_last_error("Parallel processing requires 'rayon' feature to be enabled".to_string());
|
|
396
|
+
-1
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
/// Internal function to extract a file with error handling.
|
|
401
|
+
///
|
|
402
|
+
/// Returns Result<ExtractionResult, String> for easier error propagation.
|
|
403
|
+
fn extract_file_internal(
|
|
404
|
+
file_path: &str,
|
|
405
|
+
config: &kreuzberg::core::config::ExtractionConfig,
|
|
406
|
+
) -> FfiResult<ExtractionResult> {
|
|
407
|
+
let path = Path::new(file_path);
|
|
408
|
+
|
|
409
|
+
if !path.exists() {
|
|
410
|
+
return Err(format!("File not found: {}", file_path));
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
let rt = tokio::runtime::Runtime::new().map_err(|e| format!("Failed to create runtime: {}", e))?;
|
|
414
|
+
|
|
415
|
+
rt.block_on(async {
|
|
416
|
+
kreuzberg::core::extractor::extract_file(path, None, config)
|
|
417
|
+
.await
|
|
418
|
+
.map_err(|e| format!("Extraction failed: {}", e))
|
|
419
|
+
})
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;
    use std::ptr;
    use std::sync::Mutex;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Collects everything the callbacks report during a test run.
    struct TestContext {
        results: Vec<String>,
        errors: Vec<String>,
    }

    /// Result callback that records the extracted content; never requests cancellation.
    ///
    /// `user_data` must point to a live `Mutex<TestContext>`.
    unsafe extern "C" fn test_result_callback(
        result: *const CExtractionResultView,
        file_index: usize,
        user_data: *mut c_void,
    ) -> c_int {
        // SAFETY: the tests pass a pointer to a `Mutex<TestContext>` that outlives the call.
        // Only a shared reference is formed, because the pointer originates from `&context`;
        // mutation goes through the lock.
        let ctx = unsafe { &*(user_data as *const Mutex<TestContext>) };
        let mut guard = ctx.lock().unwrap();

        if !result.is_null() {
            // SAFETY: the result view is valid for the duration of the callback.
            let view = unsafe { &*result };
            let content = if !view.content_ptr.is_null() && view.content_len > 0 {
                // SAFETY: `content_ptr`/`content_len` describe the view's content buffer.
                unsafe {
                    String::from_utf8_lossy(std::slice::from_raw_parts(view.content_ptr, view.content_len)).to_string()
                }
            } else {
                String::new()
            };
            guard.results.push(format!("File {}: {}", file_index, content));
        }

        0
    }

    /// Error callback that records the reported message.
    ///
    /// `user_data` must point to a live `Mutex<TestContext>`.
    unsafe extern "C" fn test_error_callback(file_index: usize, error_msg: *const c_char, user_data: *mut c_void) {
        // SAFETY: see `test_result_callback`.
        let ctx = unsafe { &*(user_data as *const Mutex<TestContext>) };
        let mut guard = ctx.lock().unwrap();

        // SAFETY: the library passes a null-terminated message that is valid during the call.
        let msg = unsafe { CStr::from_ptr(error_msg).to_string_lossy().to_string() };
        guard.errors.push(format!("File {}: {}", file_index, msg));
    }

    #[test]
    fn test_batch_streaming_basic() {
        let temp_dir = tempfile::tempdir().unwrap();
        let file1 = temp_dir.path().join("test1.txt");
        let file2 = temp_dir.path().join("test2.txt");
        std::fs::write(&file1, "Content 1").unwrap();
        std::fs::write(&file2, "Content 2").unwrap();

        let path1 = CString::new(file1.to_str().unwrap()).unwrap();
        let path2 = CString::new(file2.to_str().unwrap()).unwrap();
        let files = [path1.as_ptr(), path2.as_ptr()];

        let context = Mutex::new(TestContext {
            results: Vec::new(),
            errors: Vec::new(),
        });

        let result = unsafe {
            kreuzberg_extract_batch_streaming(
                files.as_ptr(),
                files.len(),
                ptr::null(),
                test_result_callback,
                &context as *const _ as *mut c_void,
                Some(test_error_callback),
            )
        };

        assert_eq!(result, 0);

        let ctx = context.lock().unwrap();
        assert_eq!(ctx.results.len(), 2);
        assert_eq!(ctx.errors.len(), 0);
    }

    #[test]
    fn test_batch_streaming_with_errors() {
        let path1 = CString::new("/nonexistent/file.txt").unwrap();
        let files = [path1.as_ptr()];

        let context = Mutex::new(TestContext {
            results: Vec::new(),
            errors: Vec::new(),
        });

        let result = unsafe {
            kreuzberg_extract_batch_streaming(
                files.as_ptr(),
                files.len(),
                ptr::null(),
                test_result_callback,
                &context as *const _ as *mut c_void,
                Some(test_error_callback),
            )
        };

        // Per-file failures are reported through the callback; the batch itself succeeds.
        assert_eq!(result, 0);

        let ctx = context.lock().unwrap();
        assert_eq!(ctx.results.len(), 0);
        assert_eq!(ctx.errors.len(), 1);
    }

    #[test]
    fn test_batch_streaming_cancellation() {
        /// Counts invocations and cancels the batch after the first file.
        ///
        /// `user_data` must point to a live `AtomicUsize`.
        unsafe extern "C" fn cancel_callback(
            _result: *const CExtractionResultView,
            file_index: usize,
            user_data: *mut c_void,
        ) -> c_int {
            // SAFETY: the test passes a pointer to an `AtomicUsize` that outlives the call.
            let calls = unsafe { &*(user_data as *const AtomicUsize) };
            calls.fetch_add(1, Ordering::SeqCst);

            if file_index == 0 { 1 } else { 0 }
        }

        let temp_dir = tempfile::tempdir().unwrap();
        let file1 = temp_dir.path().join("test1.txt");
        let file2 = temp_dir.path().join("test2.txt");
        std::fs::write(&file1, "Content 1").unwrap();
        std::fs::write(&file2, "Content 2").unwrap();

        let path1 = CString::new(file1.to_str().unwrap()).unwrap();
        let path2 = CString::new(file2.to_str().unwrap()).unwrap();
        let files = [path1.as_ptr(), path2.as_ptr()];

        let calls = AtomicUsize::new(0);

        let result = unsafe {
            kreuzberg_extract_batch_streaming(
                files.as_ptr(),
                files.len(),
                ptr::null(),
                cancel_callback,
                &calls as *const _ as *mut c_void,
                None,
            )
        };

        assert_eq!(result, 0);
        // The first callback cancels the batch, so the second file must never be delivered.
        assert_eq!(calls.load(Ordering::SeqCst), 1);
    }

    #[test]
    fn test_batch_streaming_null_files() {
        let result = unsafe {
            kreuzberg_extract_batch_streaming(ptr::null(), 1, ptr::null(), test_result_callback, ptr::null_mut(), None)
        };

        assert_eq!(result, -1);
    }

    #[test]
    fn test_batch_streaming_empty() {
        let files: Vec<*const c_char> = Vec::new();

        let result = unsafe {
            kreuzberg_extract_batch_streaming(
                files.as_ptr(),
                0,
                ptr::null(),
                test_result_callback,
                ptr::null_mut(),
                None,
            )
        };

        assert_eq!(result, 0);
    }
}
|