kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
//! DocumentExtractor plugin system FFI bindings
|
|
2
|
+
//!
|
|
3
|
+
//! Provides FFI functions for registering, managing, and executing custom document extractors
|
|
4
|
+
//! from C/Java/other FFI languages.
|
|
5
|
+
|
|
6
|
+
use std::ffi::{CStr, CString};
|
|
7
|
+
use std::os::raw::c_char;
|
|
8
|
+
use std::ptr;
|
|
9
|
+
use std::sync::Arc;
|
|
10
|
+
|
|
11
|
+
use async_trait::async_trait;
|
|
12
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
13
|
+
use kreuzberg::plugins::Plugin;
|
|
14
|
+
use kreuzberg::types::ExtractionResult;
|
|
15
|
+
use kreuzberg::{KreuzbergError, Result};
|
|
16
|
+
|
|
17
|
+
use crate::helpers::{clear_last_error, set_last_error};
|
|
18
|
+
use crate::{ffi_panic_guard, ffi_panic_guard_bool};
|
|
19
|
+
|
|
20
|
+
/// Signature of the C callback that performs extraction for a custom DocumentExtractor.
///
/// # Parameters
///
/// - `content`: start of the document bytes; the pointer is only valid while the call runs
/// - `content_len`: number of bytes readable through `content`
/// - `mime_type`: MIME type of the document as a null-terminated string
/// - `config_json`: extraction configuration serialized as a null-terminated JSON string
///
/// # Returns
///
/// A null-terminated JSON string describing the ExtractionResult, or NULL to signal failure.
/// The string must be one that kreuzberg_free_string is able to release.
///
/// # Safety
///
/// An implementation must:
/// - Never retain `content`, `mime_type`, or `config_json` beyond the call
/// - Hand back either NULL or a valid null-terminated UTF-8 JSON string
/// - Allocate the returned string so that kreuzberg_free_string can free it
///
/// NOTE(review): the wrapper in this file takes ownership of the returned pointer with
/// `CString::from_raw`, which is only defined for pointers produced by `CString::into_raw`.
/// Confirm that callers allocate the string accordingly.
type DocumentExtractorCallback = unsafe extern "C" fn(
    content: *const u8,
    content_len: usize,
    mime_type: *const c_char,
    config_json: *const c_char,
) -> *mut c_char;
|
|
46
|
+
|
|
47
|
+
/// FFI wrapper for custom DocumentExtractors registered from Java/C.
|
|
48
|
+
///
|
|
49
|
+
/// This struct wraps a C function pointer and implements the DocumentExtractor trait,
|
|
50
|
+
/// allowing custom extraction implementations from FFI languages to be registered
|
|
51
|
+
/// and used within the Rust extraction pipeline.
|
|
52
|
+
struct FfiDocumentExtractor {
|
|
53
|
+
name: String,
|
|
54
|
+
callback: DocumentExtractorCallback,
|
|
55
|
+
#[allow(dead_code)]
|
|
56
|
+
supported_types: Vec<String>,
|
|
57
|
+
supported_types_static: Vec<&'static str>,
|
|
58
|
+
priority: i32,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
impl FfiDocumentExtractor {
|
|
62
|
+
fn new(name: String, callback: DocumentExtractorCallback, supported_types: Vec<String>, priority: i32) -> Self {
|
|
63
|
+
let supported_types_static: Vec<&'static str> = supported_types
|
|
64
|
+
.iter()
|
|
65
|
+
.map(|s| {
|
|
66
|
+
let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
|
|
67
|
+
leaked
|
|
68
|
+
})
|
|
69
|
+
.collect();
|
|
70
|
+
|
|
71
|
+
Self {
|
|
72
|
+
name,
|
|
73
|
+
callback,
|
|
74
|
+
supported_types,
|
|
75
|
+
supported_types_static,
|
|
76
|
+
priority,
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
impl Plugin for FfiDocumentExtractor {
    /// Returns the name this extractor was registered under.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Returns the fixed version tag used for every FFI-backed extractor.
    fn version(&self) -> String {
        String::from("ffi-1.0.0")
    }

    /// Nothing to set up on the Rust side; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down on the Rust side; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
|
98
|
+
|
|
99
|
+
#[async_trait]
|
|
100
|
+
impl kreuzberg::plugins::DocumentExtractor for FfiDocumentExtractor {
|
|
101
|
+
async fn extract_bytes(
|
|
102
|
+
&self,
|
|
103
|
+
content: &[u8],
|
|
104
|
+
mime_type: &str,
|
|
105
|
+
config: &ExtractionConfig,
|
|
106
|
+
) -> Result<ExtractionResult> {
|
|
107
|
+
let config_json = serde_json::to_string(config).map_err(|e| KreuzbergError::Validation {
|
|
108
|
+
message: format!("Failed to serialize ExtractionConfig: {}", e),
|
|
109
|
+
source: Some(Box::new(e)),
|
|
110
|
+
})?;
|
|
111
|
+
|
|
112
|
+
let callback = self.callback;
|
|
113
|
+
let extractor_name = self.name.clone();
|
|
114
|
+
let extractor_name_error = self.name.clone();
|
|
115
|
+
let extractor_name_parse = self.name.clone();
|
|
116
|
+
let content_vec = content.to_vec();
|
|
117
|
+
let mime_type_owned = mime_type.to_string();
|
|
118
|
+
let config_json_owned = config_json.clone();
|
|
119
|
+
|
|
120
|
+
let result_json = tokio::task::spawn_blocking(move || {
|
|
121
|
+
let mime_cstr = match CString::new(mime_type_owned.clone()) {
|
|
122
|
+
Ok(s) => s,
|
|
123
|
+
Err(e) => {
|
|
124
|
+
return Err(KreuzbergError::Validation {
|
|
125
|
+
message: format!("Invalid MIME type for extractor '{}': {}", extractor_name, e),
|
|
126
|
+
source: Some(Box::new(e)),
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
let config_cstr = match CString::new(config_json_owned.clone()) {
|
|
132
|
+
Ok(s) => s,
|
|
133
|
+
Err(e) => {
|
|
134
|
+
return Err(KreuzbergError::Validation {
|
|
135
|
+
message: format!("Invalid config JSON for extractor '{}': {}", extractor_name, e),
|
|
136
|
+
source: Some(Box::new(e)),
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
};
|
|
140
|
+
|
|
141
|
+
let result_ptr = unsafe {
|
|
142
|
+
callback(
|
|
143
|
+
content_vec.as_ptr(),
|
|
144
|
+
content_vec.len(),
|
|
145
|
+
mime_cstr.as_ptr(),
|
|
146
|
+
config_cstr.as_ptr(),
|
|
147
|
+
)
|
|
148
|
+
};
|
|
149
|
+
|
|
150
|
+
if result_ptr.is_null() {
|
|
151
|
+
return Err(KreuzbergError::Parsing {
|
|
152
|
+
message: format!("DocumentExtractor '{}' returned NULL (callback failed)", extractor_name),
|
|
153
|
+
source: None,
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
let result_cstr = unsafe { CString::from_raw(result_ptr) };
|
|
158
|
+
let result_str = result_cstr.to_str().map_err(|e| KreuzbergError::Validation {
|
|
159
|
+
message: format!("Invalid UTF-8 in result from extractor '{}': {}", extractor_name, e),
|
|
160
|
+
source: Some(Box::new(e)),
|
|
161
|
+
})?;
|
|
162
|
+
|
|
163
|
+
Ok(result_str.to_string())
|
|
164
|
+
})
|
|
165
|
+
.await
|
|
166
|
+
.map_err(|e| {
|
|
167
|
+
KreuzbergError::Other(format!(
|
|
168
|
+
"Task join error in extractor '{}': {}",
|
|
169
|
+
extractor_name_error, e
|
|
170
|
+
))
|
|
171
|
+
})??;
|
|
172
|
+
|
|
173
|
+
serde_json::from_str(&result_json).map_err(|e| KreuzbergError::Parsing {
|
|
174
|
+
message: format!(
|
|
175
|
+
"Failed to deserialize ExtractionResult from extractor '{}': {}",
|
|
176
|
+
extractor_name_parse, e
|
|
177
|
+
),
|
|
178
|
+
source: Some(Box::new(e)),
|
|
179
|
+
})
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
async fn extract_file(
|
|
183
|
+
&self,
|
|
184
|
+
path: &std::path::Path,
|
|
185
|
+
mime_type: &str,
|
|
186
|
+
config: &ExtractionConfig,
|
|
187
|
+
) -> Result<ExtractionResult> {
|
|
188
|
+
let content = tokio::fs::read(path).await.map_err(KreuzbergError::Io)?;
|
|
189
|
+
self.extract_bytes(&content, mime_type, config).await
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
193
|
+
&self.supported_types_static
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
fn priority(&self) -> i32 {
|
|
197
|
+
self.priority
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/// Register a custom DocumentExtractor via FFI callback.
|
|
202
|
+
///
|
|
203
|
+
/// # Safety
|
|
204
|
+
///
|
|
205
|
+
/// - `name` must be a valid null-terminated C string
|
|
206
|
+
/// - `callback` must be a valid function pointer that:
|
|
207
|
+
/// - Does not store the content, mime_type, or config_json pointers
|
|
208
|
+
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
209
|
+
/// - The returned string must be freeable by kreuzberg_free_string
|
|
210
|
+
/// - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
|
|
211
|
+
/// - `priority` determines the order of selection (higher priority preferred)
|
|
212
|
+
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
213
|
+
///
|
|
214
|
+
/// # Example (C)
|
|
215
|
+
///
|
|
216
|
+
/// ```c
|
|
217
|
+
/// char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
|
|
218
|
+
/// // Extract content from bytes, return JSON ExtractionResult
|
|
219
|
+
/// return strdup("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
|
|
220
|
+
/// }
|
|
221
|
+
///
|
|
222
|
+
/// bool success = kreuzberg_register_document_extractor(
|
|
223
|
+
/// "my-extractor",
|
|
224
|
+
/// my_extractor,
|
|
225
|
+
/// "application/x-custom,text/x-custom",
|
|
226
|
+
/// 100
|
|
227
|
+
/// );
|
|
228
|
+
/// if (!success) {
|
|
229
|
+
/// const char* error = kreuzberg_last_error();
|
|
230
|
+
/// printf("Failed to register: %s\n", error);
|
|
231
|
+
/// }
|
|
232
|
+
/// ```
|
|
233
|
+
#[unsafe(no_mangle)]
|
|
234
|
+
pub unsafe extern "C" fn kreuzberg_register_document_extractor(
|
|
235
|
+
name: *const c_char,
|
|
236
|
+
callback: DocumentExtractorCallback,
|
|
237
|
+
mime_types: *const c_char,
|
|
238
|
+
priority: i32,
|
|
239
|
+
) -> bool {
|
|
240
|
+
ffi_panic_guard_bool!("kreuzberg_register_document_extractor", {
|
|
241
|
+
clear_last_error();
|
|
242
|
+
|
|
243
|
+
if name.is_null() {
|
|
244
|
+
set_last_error("DocumentExtractor name cannot be NULL".to_string());
|
|
245
|
+
return false;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
if mime_types.is_null() {
|
|
249
|
+
set_last_error("MIME types cannot be NULL".to_string());
|
|
250
|
+
return false;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
254
|
+
Ok(s) => s,
|
|
255
|
+
Err(e) => {
|
|
256
|
+
set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
|
|
257
|
+
return false;
|
|
258
|
+
}
|
|
259
|
+
};
|
|
260
|
+
|
|
261
|
+
if name_str.is_empty() {
|
|
262
|
+
set_last_error("Plugin name cannot be empty".to_string());
|
|
263
|
+
return false;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
267
|
+
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
268
|
+
return false;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
let mime_types_str = match unsafe { CStr::from_ptr(mime_types) }.to_str() {
|
|
272
|
+
Ok(s) => s,
|
|
273
|
+
Err(e) => {
|
|
274
|
+
set_last_error(format!("Invalid UTF-8 in MIME types: {}", e));
|
|
275
|
+
return false;
|
|
276
|
+
}
|
|
277
|
+
};
|
|
278
|
+
|
|
279
|
+
let supported_types: Vec<String> = mime_types_str
|
|
280
|
+
.split(',')
|
|
281
|
+
.map(|s| s.trim().to_string())
|
|
282
|
+
.filter(|s| !s.is_empty())
|
|
283
|
+
.collect();
|
|
284
|
+
|
|
285
|
+
if supported_types.is_empty() {
|
|
286
|
+
set_last_error("At least one MIME type must be specified".to_string());
|
|
287
|
+
return false;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
let extractor = Arc::new(FfiDocumentExtractor::new(
|
|
291
|
+
name_str.to_string(),
|
|
292
|
+
callback,
|
|
293
|
+
supported_types,
|
|
294
|
+
priority,
|
|
295
|
+
));
|
|
296
|
+
|
|
297
|
+
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
298
|
+
let mut registry_guard = match registry.write() {
|
|
299
|
+
Ok(guard) => guard,
|
|
300
|
+
Err(e) => {
|
|
301
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
302
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
303
|
+
return false;
|
|
304
|
+
}
|
|
305
|
+
};
|
|
306
|
+
|
|
307
|
+
match registry_guard.register(extractor) {
|
|
308
|
+
Ok(()) => true,
|
|
309
|
+
Err(e) => {
|
|
310
|
+
set_last_error(format!("Failed to register DocumentExtractor: {}", e));
|
|
311
|
+
false
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
})
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
/// Unregister a DocumentExtractor by name.
|
|
318
|
+
///
|
|
319
|
+
/// # Safety
|
|
320
|
+
///
|
|
321
|
+
/// - `name` must be a valid null-terminated C string
|
|
322
|
+
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
323
|
+
///
|
|
324
|
+
/// # Example (C)
|
|
325
|
+
///
|
|
326
|
+
/// ```c
|
|
327
|
+
/// bool success = kreuzberg_unregister_document_extractor("my-extractor");
|
|
328
|
+
/// if (!success) {
|
|
329
|
+
/// const char* error = kreuzberg_last_error();
|
|
330
|
+
/// printf("Failed to unregister: %s\n", error);
|
|
331
|
+
/// }
|
|
332
|
+
/// ```
|
|
333
|
+
#[unsafe(no_mangle)]
|
|
334
|
+
pub unsafe extern "C" fn kreuzberg_unregister_document_extractor(name: *const c_char) -> bool {
|
|
335
|
+
ffi_panic_guard_bool!("kreuzberg_unregister_document_extractor", {
|
|
336
|
+
clear_last_error();
|
|
337
|
+
|
|
338
|
+
if name.is_null() {
|
|
339
|
+
set_last_error("DocumentExtractor name cannot be NULL".to_string());
|
|
340
|
+
return false;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
344
|
+
Ok(s) => s,
|
|
345
|
+
Err(e) => {
|
|
346
|
+
set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
|
|
347
|
+
return false;
|
|
348
|
+
}
|
|
349
|
+
};
|
|
350
|
+
|
|
351
|
+
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
352
|
+
let mut registry_guard = match registry.write() {
|
|
353
|
+
Ok(guard) => guard,
|
|
354
|
+
Err(e) => {
|
|
355
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
356
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
357
|
+
return false;
|
|
358
|
+
}
|
|
359
|
+
};
|
|
360
|
+
|
|
361
|
+
match registry_guard.remove(name_str) {
|
|
362
|
+
Ok(()) => true,
|
|
363
|
+
Err(e) => {
|
|
364
|
+
set_last_error(format!("Failed to remove DocumentExtractor: {}", e));
|
|
365
|
+
false
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
})
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
/// List all registered DocumentExtractors as a JSON array of names.
|
|
372
|
+
///
|
|
373
|
+
/// # Safety
|
|
374
|
+
///
|
|
375
|
+
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
376
|
+
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
377
|
+
#[unsafe(no_mangle)]
|
|
378
|
+
pub unsafe extern "C" fn kreuzberg_list_document_extractors() -> *mut c_char {
|
|
379
|
+
ffi_panic_guard!("kreuzberg_list_document_extractors", {
|
|
380
|
+
clear_last_error();
|
|
381
|
+
|
|
382
|
+
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
383
|
+
let registry_guard = match registry.read() {
|
|
384
|
+
Ok(guard) => guard,
|
|
385
|
+
Err(e) => {
|
|
386
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
387
|
+
set_last_error(format!("Failed to acquire registry read lock: {}", e));
|
|
388
|
+
return ptr::null_mut();
|
|
389
|
+
}
|
|
390
|
+
};
|
|
391
|
+
|
|
392
|
+
match serde_json::to_string(®istry_guard.list()) {
|
|
393
|
+
Ok(json) => match CString::new(json) {
|
|
394
|
+
Ok(cstr) => cstr.into_raw(),
|
|
395
|
+
Err(e) => {
|
|
396
|
+
set_last_error(format!("Failed to create C string: {}", e));
|
|
397
|
+
ptr::null_mut()
|
|
398
|
+
}
|
|
399
|
+
},
|
|
400
|
+
Err(e) => {
|
|
401
|
+
set_last_error(format!("Failed to serialize DocumentExtractor list: {}", e));
|
|
402
|
+
ptr::null_mut()
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
})
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
/// Clear all registered DocumentExtractors.
|
|
409
|
+
///
|
|
410
|
+
/// # Safety
|
|
411
|
+
///
|
|
412
|
+
/// - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
|
|
413
|
+
/// - Returns true on success, false on error.
|
|
414
|
+
///
|
|
415
|
+
/// # Example (C)
|
|
416
|
+
///
|
|
417
|
+
/// ```c
|
|
418
|
+
/// bool success = kreuzberg_clear_document_extractors();
|
|
419
|
+
/// if (!success) {
|
|
420
|
+
/// const char* error = kreuzberg_last_error();
|
|
421
|
+
/// printf("Failed to clear document extractors: %s\n", error);
|
|
422
|
+
/// }
|
|
423
|
+
/// ```
|
|
424
|
+
#[unsafe(no_mangle)]
|
|
425
|
+
pub unsafe extern "C" fn kreuzberg_clear_document_extractors() -> bool {
|
|
426
|
+
ffi_panic_guard_bool!("kreuzberg_clear_document_extractors", {
|
|
427
|
+
clear_last_error();
|
|
428
|
+
|
|
429
|
+
let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
|
|
430
|
+
let mut registry_guard = match registry.write() {
|
|
431
|
+
Ok(guard) => guard,
|
|
432
|
+
Err(e) => {
|
|
433
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
434
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
435
|
+
return false;
|
|
436
|
+
}
|
|
437
|
+
};
|
|
438
|
+
|
|
439
|
+
*registry_guard = Default::default();
|
|
440
|
+
true
|
|
441
|
+
})
|
|
442
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
//! Plugin system FFI bindings
|
|
2
|
+
//!
|
|
3
|
+
//! Provides FFI bindings for registering and managing plugins.
|
|
4
|
+
|
|
5
|
+
pub mod document_extractor;
|
|
6
|
+
pub mod ocr_backend;
|
|
7
|
+
pub mod post_processor;
|
|
8
|
+
pub mod validator;
|
|
9
|
+
|
|
10
|
+
// Re-export all public items
|
|
11
|
+
pub use document_extractor::*;
|
|
12
|
+
pub use ocr_backend::*;
|
|
13
|
+
pub use post_processor::*;
|
|
14
|
+
pub use validator::*;
|