kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,555 @@
|
|
|
1
|
+
//! Core extraction FFI functions.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides the main FFI entry points for document extraction operations.
|
|
4
|
+
//! These functions are the most critical part of the FFI layer and handle both
|
|
5
|
+
//! synchronous file and byte array extraction operations, including batch processing.
|
|
6
|
+
//!
|
|
7
|
+
//! # Safety
|
|
8
|
+
//!
|
|
9
|
+
//! All functions in this module are marked as `unsafe extern "C"` because they interact
|
|
10
|
+
//! with raw C pointers and must follow strict memory management rules. Callers are
|
|
11
|
+
//! responsible for ensuring:
|
|
12
|
+
//! - All input pointers are valid and properly aligned
|
|
13
|
+
//! - All returned pointers are freed with the appropriate free functions
|
|
14
|
+
//! - Configuration JSON is valid UTF-8 and valid JSON
|
|
15
|
+
//!
|
|
16
|
+
//! # Memory Management
|
|
17
|
+
//!
|
|
18
|
+
//! The batch extraction functions have special memory management requirements:
|
|
19
|
+
//! - They allocate a vector of results and convert it to a boxed slice
|
|
20
|
+
//! - The slice pointer is cast and stored in the CBatchResult
|
|
21
|
+
//! - Deallocation must reconstruct the slice before freeing
|
|
22
|
+
//! - This is handled by `kreuzberg_free_batch_result` in the memory module
|
|
23
|
+
|
|
24
|
+
use std::ffi::CStr;
|
|
25
|
+
use std::os::raw::c_char;
|
|
26
|
+
use std::path::Path;
|
|
27
|
+
use std::ptr;
|
|
28
|
+
|
|
29
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
30
|
+
|
|
31
|
+
use crate::ffi_panic_guard;
|
|
32
|
+
use crate::helpers::{clear_last_error, parse_extraction_config_from_json, set_last_error, to_c_extraction_result};
|
|
33
|
+
use crate::memory::kreuzberg_free_result;
|
|
34
|
+
use crate::types::{CBatchResult, CBytesWithMime, CExtractionResult};
|
|
35
|
+
|
|
36
|
+
/// Extract text and metadata from a file (synchronous).
|
|
37
|
+
///
|
|
38
|
+
/// # Safety
|
|
39
|
+
///
|
|
40
|
+
/// - `file_path` must be a valid null-terminated C string
|
|
41
|
+
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
42
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
43
|
+
///
|
|
44
|
+
/// # Example (C)
|
|
45
|
+
///
|
|
46
|
+
/// ```c
|
|
47
|
+
/// const char* path = "/path/to/document.pdf";
|
|
48
|
+
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
49
|
+
/// if (result != NULL && result->success) {
|
|
50
|
+
/// printf("Content: %s\n", result->content);
|
|
51
|
+
/// printf("MIME: %s\n", result->mime_type);
|
|
52
|
+
/// kreuzberg_free_result(result);
|
|
53
|
+
/// } else {
|
|
54
|
+
/// const char* error = kreuzberg_last_error();
|
|
55
|
+
/// printf("Error: %s\n", error);
|
|
56
|
+
/// }
|
|
57
|
+
/// ```
|
|
58
|
+
#[unsafe(no_mangle)]
|
|
59
|
+
pub unsafe extern "C" fn kreuzberg_extract_file_sync(file_path: *const c_char) -> *mut CExtractionResult {
|
|
60
|
+
ffi_panic_guard!("kreuzberg_extract_file_sync", {
|
|
61
|
+
clear_last_error();
|
|
62
|
+
|
|
63
|
+
if file_path.is_null() {
|
|
64
|
+
set_last_error("file_path cannot be NULL".to_string());
|
|
65
|
+
return ptr::null_mut();
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
69
|
+
Ok(s) => s,
|
|
70
|
+
Err(e) => {
|
|
71
|
+
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
72
|
+
return ptr::null_mut();
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
let path = Path::new(path_str);
|
|
77
|
+
let config = ExtractionConfig::default();
|
|
78
|
+
|
|
79
|
+
match kreuzberg::extract_file_sync(path, None, &config) {
|
|
80
|
+
Ok(result) => match to_c_extraction_result(result) {
|
|
81
|
+
Ok(ptr) => ptr,
|
|
82
|
+
Err(e) => {
|
|
83
|
+
set_last_error(e);
|
|
84
|
+
ptr::null_mut()
|
|
85
|
+
}
|
|
86
|
+
},
|
|
87
|
+
Err(e) => {
|
|
88
|
+
set_last_error(e.to_string());
|
|
89
|
+
ptr::null_mut()
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
})
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/// Extract text and metadata from a file with custom configuration (synchronous).
|
|
96
|
+
///
|
|
97
|
+
/// # Safety
|
|
98
|
+
///
|
|
99
|
+
/// - `file_path` must be a valid null-terminated C string
|
|
100
|
+
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
101
|
+
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
102
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
103
|
+
///
|
|
104
|
+
/// # Example (C)
|
|
105
|
+
///
|
|
106
|
+
/// ```c
|
|
107
|
+
/// const char* path = "/path/to/document.pdf";
|
|
108
|
+
/// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
109
|
+
/// CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
|
|
110
|
+
/// if (result != NULL && result->success) {
|
|
111
|
+
/// printf("Content: %s\n", result->content);
|
|
112
|
+
/// kreuzberg_free_result(result);
|
|
113
|
+
/// }
|
|
114
|
+
/// ```
|
|
115
|
+
#[unsafe(no_mangle)]
|
|
116
|
+
pub unsafe extern "C" fn kreuzberg_extract_file_sync_with_config(
|
|
117
|
+
file_path: *const c_char,
|
|
118
|
+
config_json: *const c_char,
|
|
119
|
+
) -> *mut CExtractionResult {
|
|
120
|
+
ffi_panic_guard!("kreuzberg_extract_file_sync_with_config", {
|
|
121
|
+
clear_last_error();
|
|
122
|
+
|
|
123
|
+
if file_path.is_null() {
|
|
124
|
+
set_last_error("file_path cannot be NULL".to_string());
|
|
125
|
+
return ptr::null_mut();
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
129
|
+
Ok(s) => s,
|
|
130
|
+
Err(e) => {
|
|
131
|
+
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
132
|
+
return ptr::null_mut();
|
|
133
|
+
}
|
|
134
|
+
};
|
|
135
|
+
|
|
136
|
+
let path = Path::new(path_str);
|
|
137
|
+
|
|
138
|
+
let config = if config_json.is_null() {
|
|
139
|
+
ExtractionConfig::default()
|
|
140
|
+
} else {
|
|
141
|
+
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
142
|
+
Ok(s) => s,
|
|
143
|
+
Err(e) => {
|
|
144
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
145
|
+
return ptr::null_mut();
|
|
146
|
+
}
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
match parse_extraction_config_from_json(config_str) {
|
|
150
|
+
Ok(cfg) => cfg,
|
|
151
|
+
Err(e) => {
|
|
152
|
+
set_last_error(e);
|
|
153
|
+
return ptr::null_mut();
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
match kreuzberg::extract_file_sync(path, None, &config) {
|
|
159
|
+
Ok(result) => match to_c_extraction_result(result) {
|
|
160
|
+
Ok(ptr) => ptr,
|
|
161
|
+
Err(e) => {
|
|
162
|
+
set_last_error(e);
|
|
163
|
+
ptr::null_mut()
|
|
164
|
+
}
|
|
165
|
+
},
|
|
166
|
+
Err(e) => {
|
|
167
|
+
set_last_error(e.to_string());
|
|
168
|
+
ptr::null_mut()
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
})
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/// Extract text and metadata from byte array (synchronous).
|
|
175
|
+
///
|
|
176
|
+
/// # Safety
|
|
177
|
+
///
|
|
178
|
+
/// - `data` must be a valid pointer to a byte array of length `data_len`
|
|
179
|
+
/// - `mime_type` must be a valid null-terminated C string
|
|
180
|
+
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
181
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
182
|
+
///
|
|
183
|
+
/// # Example (C)
|
|
184
|
+
///
|
|
185
|
+
/// ```c
|
|
186
|
+
/// const uint8_t* data = ...; // Document bytes
|
|
187
|
+
/// size_t len = ...; // Length of data
|
|
188
|
+
/// const char* mime = "application/pdf";
|
|
189
|
+
/// CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
|
|
190
|
+
/// if (result != NULL && result->success) {
|
|
191
|
+
/// printf("Content: %s\n", result->content);
|
|
192
|
+
/// kreuzberg_free_result(result);
|
|
193
|
+
/// } else {
|
|
194
|
+
/// const char* error = kreuzberg_last_error();
|
|
195
|
+
/// printf("Error: %s\n", error);
|
|
196
|
+
/// }
|
|
197
|
+
/// ```
|
|
198
|
+
#[unsafe(no_mangle)]
|
|
199
|
+
pub unsafe extern "C" fn kreuzberg_extract_bytes_sync(
|
|
200
|
+
data: *const u8,
|
|
201
|
+
data_len: usize,
|
|
202
|
+
mime_type: *const c_char,
|
|
203
|
+
) -> *mut CExtractionResult {
|
|
204
|
+
ffi_panic_guard!("kreuzberg_extract_bytes_sync", {
|
|
205
|
+
clear_last_error();
|
|
206
|
+
|
|
207
|
+
if data.is_null() {
|
|
208
|
+
set_last_error("data cannot be NULL".to_string());
|
|
209
|
+
return ptr::null_mut();
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
if mime_type.is_null() {
|
|
213
|
+
set_last_error("mime_type cannot be NULL".to_string());
|
|
214
|
+
return ptr::null_mut();
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
|
|
218
|
+
|
|
219
|
+
let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
220
|
+
Ok(s) => s,
|
|
221
|
+
Err(e) => {
|
|
222
|
+
set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
|
|
223
|
+
return ptr::null_mut();
|
|
224
|
+
}
|
|
225
|
+
};
|
|
226
|
+
|
|
227
|
+
let config = ExtractionConfig::default();
|
|
228
|
+
|
|
229
|
+
match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
|
|
230
|
+
Ok(result) => match to_c_extraction_result(result) {
|
|
231
|
+
Ok(ptr) => ptr,
|
|
232
|
+
Err(e) => {
|
|
233
|
+
set_last_error(e);
|
|
234
|
+
ptr::null_mut()
|
|
235
|
+
}
|
|
236
|
+
},
|
|
237
|
+
Err(e) => {
|
|
238
|
+
set_last_error(e.to_string());
|
|
239
|
+
ptr::null_mut()
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
})
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
/// Extract text and metadata from byte array with custom configuration (synchronous).
|
|
246
|
+
///
|
|
247
|
+
/// # Safety
|
|
248
|
+
///
|
|
249
|
+
/// - `data` must be a valid pointer to a byte array of length `data_len`
|
|
250
|
+
/// - `mime_type` must be a valid null-terminated C string
|
|
251
|
+
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
252
|
+
/// - The returned pointer must be freed with `kreuzberg_free_result`
|
|
253
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
254
|
+
///
|
|
255
|
+
/// # Example (C)
|
|
256
|
+
///
|
|
257
|
+
/// ```c
|
|
258
|
+
/// const uint8_t* data = ...; // Document bytes
|
|
259
|
+
/// size_t len = ...; // Length of data
|
|
260
|
+
/// const char* mime = "application/pdf";
|
|
261
|
+
/// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
262
|
+
/// CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
|
|
263
|
+
/// if (result != NULL && result->success) {
|
|
264
|
+
/// printf("Content: %s\n", result->content);
|
|
265
|
+
/// kreuzberg_free_result(result);
|
|
266
|
+
/// }
|
|
267
|
+
/// ```
|
|
268
|
+
#[unsafe(no_mangle)]
|
|
269
|
+
pub unsafe extern "C" fn kreuzberg_extract_bytes_sync_with_config(
|
|
270
|
+
data: *const u8,
|
|
271
|
+
data_len: usize,
|
|
272
|
+
mime_type: *const c_char,
|
|
273
|
+
config_json: *const c_char,
|
|
274
|
+
) -> *mut CExtractionResult {
|
|
275
|
+
ffi_panic_guard!("kreuzberg_extract_bytes_sync_with_config", {
|
|
276
|
+
clear_last_error();
|
|
277
|
+
|
|
278
|
+
if data.is_null() {
|
|
279
|
+
set_last_error("data cannot be NULL".to_string());
|
|
280
|
+
return ptr::null_mut();
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
if mime_type.is_null() {
|
|
284
|
+
set_last_error("mime_type cannot be NULL".to_string());
|
|
285
|
+
return ptr::null_mut();
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
|
|
289
|
+
|
|
290
|
+
let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
|
|
291
|
+
Ok(s) => s,
|
|
292
|
+
Err(e) => {
|
|
293
|
+
set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
|
|
294
|
+
return ptr::null_mut();
|
|
295
|
+
}
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
let config = if config_json.is_null() {
|
|
299
|
+
ExtractionConfig::default()
|
|
300
|
+
} else {
|
|
301
|
+
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
302
|
+
Ok(s) => s,
|
|
303
|
+
Err(e) => {
|
|
304
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
305
|
+
return ptr::null_mut();
|
|
306
|
+
}
|
|
307
|
+
};
|
|
308
|
+
|
|
309
|
+
match parse_extraction_config_from_json(config_str) {
|
|
310
|
+
Ok(cfg) => cfg,
|
|
311
|
+
Err(e) => {
|
|
312
|
+
set_last_error(e);
|
|
313
|
+
return ptr::null_mut();
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
};
|
|
317
|
+
|
|
318
|
+
match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
|
|
319
|
+
Ok(result) => match to_c_extraction_result(result) {
|
|
320
|
+
Ok(ptr) => ptr,
|
|
321
|
+
Err(e) => {
|
|
322
|
+
set_last_error(e);
|
|
323
|
+
ptr::null_mut()
|
|
324
|
+
}
|
|
325
|
+
},
|
|
326
|
+
Err(e) => {
|
|
327
|
+
set_last_error(e.to_string());
|
|
328
|
+
ptr::null_mut()
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
})
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/// Batch extract text and metadata from multiple files (synchronous).
|
|
335
|
+
///
|
|
336
|
+
/// # Safety
|
|
337
|
+
///
|
|
338
|
+
/// - `file_paths` must be a valid pointer to an array of null-terminated C strings
|
|
339
|
+
/// - `count` must be the number of file paths in the array
|
|
340
|
+
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
341
|
+
/// - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
342
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
343
|
+
///
|
|
344
|
+
/// # Critical Memory Management
|
|
345
|
+
///
|
|
346
|
+
/// This function has special memory management requirements due to the need to allocate
|
|
347
|
+
/// an array of result pointers:
|
|
348
|
+
///
|
|
349
|
+
/// 1. Results are collected in a Vec<*mut CExtractionResult>
|
|
350
|
+
/// 2. The vec is converted to a boxed slice (changes allocation metadata)
|
|
351
|
+
/// 3. The boxed slice pointer is cast to *mut *mut CExtractionResult
|
|
352
|
+
/// 4. This pointer is stored in CBatchResult
|
|
353
|
+
/// 5. Deallocation must reverse this process using slice_from_raw_parts
|
|
354
|
+
///
|
|
355
|
+
/// The Go segfault issue was caused by incorrect deallocation in the memory module.
|
|
356
|
+
/// This allocation pattern must be perfectly mirrored in the free function.
|
|
357
|
+
#[unsafe(no_mangle)]
|
|
358
|
+
pub unsafe extern "C" fn kreuzberg_batch_extract_files_sync(
|
|
359
|
+
file_paths: *const *const c_char,
|
|
360
|
+
count: usize,
|
|
361
|
+
config_json: *const c_char,
|
|
362
|
+
) -> *mut CBatchResult {
|
|
363
|
+
ffi_panic_guard!("kreuzberg_batch_extract_files_sync", {
|
|
364
|
+
clear_last_error();
|
|
365
|
+
|
|
366
|
+
if file_paths.is_null() {
|
|
367
|
+
set_last_error("file_paths cannot be NULL".to_string());
|
|
368
|
+
return ptr::null_mut();
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
let config = if config_json.is_null() {
|
|
372
|
+
ExtractionConfig::default()
|
|
373
|
+
} else {
|
|
374
|
+
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
375
|
+
Ok(s) => s,
|
|
376
|
+
Err(e) => {
|
|
377
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
378
|
+
return ptr::null_mut();
|
|
379
|
+
}
|
|
380
|
+
};
|
|
381
|
+
|
|
382
|
+
match parse_extraction_config_from_json(config_str) {
|
|
383
|
+
Ok(cfg) => cfg,
|
|
384
|
+
Err(e) => {
|
|
385
|
+
set_last_error(e);
|
|
386
|
+
return ptr::null_mut();
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
let mut paths: Vec<std::path::PathBuf> = Vec::with_capacity(count);
|
|
392
|
+
for i in 0..count {
|
|
393
|
+
let path_ptr = unsafe { *file_paths.add(i) };
|
|
394
|
+
if path_ptr.is_null() {
|
|
395
|
+
set_last_error(format!("File path at index {} is NULL", i));
|
|
396
|
+
return ptr::null_mut();
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
let path_str = match unsafe { CStr::from_ptr(path_ptr) }.to_str() {
|
|
400
|
+
Ok(s) => s,
|
|
401
|
+
Err(e) => {
|
|
402
|
+
set_last_error(format!("Invalid UTF-8 in file path at index {}: {}", i, e));
|
|
403
|
+
return ptr::null_mut();
|
|
404
|
+
}
|
|
405
|
+
};
|
|
406
|
+
|
|
407
|
+
paths.push(std::path::PathBuf::from(path_str));
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
let path_refs: Vec<&std::path::Path> = paths.iter().map(|p| p.as_path()).collect();
|
|
411
|
+
match kreuzberg::batch_extract_file_sync(path_refs, &config) {
|
|
412
|
+
Ok(results) => {
|
|
413
|
+
let mut c_results = Vec::with_capacity(results.len());
|
|
414
|
+
for result in results {
|
|
415
|
+
match to_c_extraction_result(result) {
|
|
416
|
+
Ok(ptr) => c_results.push(ptr),
|
|
417
|
+
Err(e) => {
|
|
418
|
+
for c_res in c_results {
|
|
419
|
+
unsafe { kreuzberg_free_result(c_res) };
|
|
420
|
+
}
|
|
421
|
+
set_last_error(e);
|
|
422
|
+
return ptr::null_mut();
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
let actual_count = c_results.len();
|
|
428
|
+
let results_array = c_results.into_boxed_slice();
|
|
429
|
+
let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
|
|
430
|
+
|
|
431
|
+
Box::into_raw(Box::new(CBatchResult {
|
|
432
|
+
results: results_ptr,
|
|
433
|
+
count: actual_count,
|
|
434
|
+
success: true,
|
|
435
|
+
_padding2: [0u8; 7],
|
|
436
|
+
}))
|
|
437
|
+
}
|
|
438
|
+
Err(e) => {
|
|
439
|
+
set_last_error(e.to_string());
|
|
440
|
+
ptr::null_mut()
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
})
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
/// Batch extract text and metadata from multiple byte arrays (synchronous).
|
|
447
|
+
///
|
|
448
|
+
/// # Safety
|
|
449
|
+
///
|
|
450
|
+
/// - `items` must be a valid pointer to an array of CBytesWithMime structures
|
|
451
|
+
/// - `count` must be the number of items in the array
|
|
452
|
+
/// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
453
|
+
/// - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
454
|
+
/// - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
455
|
+
///
|
|
456
|
+
/// # Critical Memory Management
|
|
457
|
+
///
|
|
458
|
+
/// This function shares the same critical memory management pattern as
|
|
459
|
+
/// `kreuzberg_batch_extract_files_sync`. See that function's documentation
|
|
460
|
+
/// for details on the Box/Vec/slice allocation pattern.
|
|
461
|
+
#[unsafe(no_mangle)]
|
|
462
|
+
pub unsafe extern "C" fn kreuzberg_batch_extract_bytes_sync(
|
|
463
|
+
items: *const CBytesWithMime,
|
|
464
|
+
count: usize,
|
|
465
|
+
config_json: *const c_char,
|
|
466
|
+
) -> *mut CBatchResult {
|
|
467
|
+
ffi_panic_guard!("kreuzberg_batch_extract_bytes_sync", {
|
|
468
|
+
clear_last_error();
|
|
469
|
+
|
|
470
|
+
if items.is_null() {
|
|
471
|
+
set_last_error("items cannot be NULL".to_string());
|
|
472
|
+
return ptr::null_mut();
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
let config = if config_json.is_null() {
|
|
476
|
+
ExtractionConfig::default()
|
|
477
|
+
} else {
|
|
478
|
+
let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
|
|
479
|
+
Ok(s) => s,
|
|
480
|
+
Err(e) => {
|
|
481
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
482
|
+
return ptr::null_mut();
|
|
483
|
+
}
|
|
484
|
+
};
|
|
485
|
+
|
|
486
|
+
match parse_extraction_config_from_json(config_str) {
|
|
487
|
+
Ok(cfg) => cfg,
|
|
488
|
+
Err(e) => {
|
|
489
|
+
set_last_error(e);
|
|
490
|
+
return ptr::null_mut();
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
};
|
|
494
|
+
|
|
495
|
+
let mut contents: Vec<(Vec<u8>, String)> = Vec::with_capacity(count);
|
|
496
|
+
for i in 0..count {
|
|
497
|
+
let item = unsafe { &*items.add(i) };
|
|
498
|
+
|
|
499
|
+
if item.data.is_null() {
|
|
500
|
+
set_last_error(format!("Data at index {} is NULL", i));
|
|
501
|
+
return ptr::null_mut();
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
if item.mime_type.is_null() {
|
|
505
|
+
set_last_error(format!("MIME type at index {} is NULL", i));
|
|
506
|
+
return ptr::null_mut();
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
let bytes = unsafe { std::slice::from_raw_parts(item.data, item.data_len) };
|
|
510
|
+
|
|
511
|
+
let mime_str = match unsafe { CStr::from_ptr(item.mime_type) }.to_str() {
|
|
512
|
+
Ok(s) => s,
|
|
513
|
+
Err(e) => {
|
|
514
|
+
set_last_error(format!("Invalid UTF-8 in MIME type at index {}: {}", i, e));
|
|
515
|
+
return ptr::null_mut();
|
|
516
|
+
}
|
|
517
|
+
};
|
|
518
|
+
|
|
519
|
+
contents.push((bytes.to_vec(), mime_str.to_string()));
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
match kreuzberg::batch_extract_bytes_sync(contents, &config) {
|
|
523
|
+
Ok(results) => {
|
|
524
|
+
let mut c_results = Vec::with_capacity(results.len());
|
|
525
|
+
for result in results {
|
|
526
|
+
match to_c_extraction_result(result) {
|
|
527
|
+
Ok(ptr) => c_results.push(ptr),
|
|
528
|
+
Err(e) => {
|
|
529
|
+
for c_res in c_results {
|
|
530
|
+
unsafe { kreuzberg_free_result(c_res) };
|
|
531
|
+
}
|
|
532
|
+
set_last_error(e);
|
|
533
|
+
return ptr::null_mut();
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
let actual_count = c_results.len();
|
|
539
|
+
let results_array = c_results.into_boxed_slice();
|
|
540
|
+
let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
|
|
541
|
+
|
|
542
|
+
Box::into_raw(Box::new(CBatchResult {
|
|
543
|
+
results: results_ptr,
|
|
544
|
+
count: actual_count,
|
|
545
|
+
success: true,
|
|
546
|
+
_padding2: [0u8; 7],
|
|
547
|
+
}))
|
|
548
|
+
}
|
|
549
|
+
Err(e) => {
|
|
550
|
+
set_last_error(e.to_string());
|
|
551
|
+
ptr::null_mut()
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
})
|
|
555
|
+
}
|