kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
//! PostProcessor plugin system FFI bindings
|
|
2
|
+
//!
|
|
3
|
+
//! Provides FFI functions for registering, managing, and executing custom post-processors
|
|
4
|
+
//! from C/Java/other FFI languages.
|
|
5
|
+
|
|
6
|
+
use std::ffi::{CStr, CString};
|
|
7
|
+
use std::os::raw::c_char;
|
|
8
|
+
use std::ptr;
|
|
9
|
+
use std::sync::Arc;
|
|
10
|
+
|
|
11
|
+
use async_trait::async_trait;
|
|
12
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
13
|
+
use kreuzberg::plugins::{Plugin, ProcessingStage};
|
|
14
|
+
use kreuzberg::types::ExtractionResult;
|
|
15
|
+
use kreuzberg::{KreuzbergError, Result};
|
|
16
|
+
|
|
17
|
+
use crate::helpers::{clear_last_error, set_last_error};
|
|
18
|
+
use crate::memory::kreuzberg_free_string;
|
|
19
|
+
use crate::{ffi_panic_guard, ffi_panic_guard_bool};
|
|
20
|
+
|
|
21
|
+
/// Signature of the C callback that implements a custom post-processor.
///
/// # Parameters
///
/// - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
///
/// # Returns
///
/// Null-terminated JSON string containing the processed ExtractionResult,
/// or NULL on error. Ownership of a non-NULL return value passes to Rust,
/// which releases it with `kreuzberg_free_string` once it has been copied.
///
/// # Safety
///
/// The callback must:
/// - Not store the result_json pointer (it's only valid for the duration of the call)
/// - Return a valid null-terminated UTF-8 JSON string that the callback itself
///   allocated (it is not a buffer handed in by Rust) and that
///   `kreuzberg_free_string` is able to release
/// - Not read, write, or free the returned string after returning it
/// - Return NULL on error (error message should be retrievable separately)
///
/// NOTE(review): which allocator `kreuzberg_free_string` expects is defined in
/// `crate::memory`, outside this file — confirm that buffers produced by the
/// foreign side (e.g. `malloc`/`strdup`) are actually compatible with it.
pub type PostProcessorCallback = unsafe extern "C" fn(result_json: *const c_char) -> *mut c_char;
|
|
39
|
+
|
|
40
|
+
/// FFI wrapper for custom PostProcessors registered from Java/C.
|
|
41
|
+
///
|
|
42
|
+
/// This struct wraps a C function pointer and implements the PostProcessor trait,
|
|
43
|
+
/// allowing custom post-processing implementations from FFI languages to be registered
|
|
44
|
+
/// and used within the Rust extraction pipeline.
|
|
45
|
+
struct FfiPostProcessor {
|
|
46
|
+
name: String,
|
|
47
|
+
callback: PostProcessorCallback,
|
|
48
|
+
stage: ProcessingStage,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
impl FfiPostProcessor {
|
|
52
|
+
fn new(name: String, callback: PostProcessorCallback, stage: ProcessingStage) -> Self {
|
|
53
|
+
Self { name, callback, stage }
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
impl Plugin for FfiPostProcessor {
    /// Returns the name this processor was constructed with.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Fixed version tag shared by every FFI-registered post-processor.
    fn version(&self) -> String {
        String::from("ffi-1.0.0")
    }

    /// Nothing to set up: always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down: always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
|
74
|
+
|
|
75
|
+
#[async_trait]
|
|
76
|
+
impl kreuzberg::plugins::PostProcessor for FfiPostProcessor {
|
|
77
|
+
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
78
|
+
let result_json = serde_json::to_string(&*result).map_err(|e| KreuzbergError::Validation {
|
|
79
|
+
message: format!("Failed to serialize ExtractionResult: {}", e),
|
|
80
|
+
source: Some(Box::new(e)),
|
|
81
|
+
})?;
|
|
82
|
+
|
|
83
|
+
let callback = self.callback;
|
|
84
|
+
let processor_name = self.name.clone();
|
|
85
|
+
let result_json_owned = result_json.clone();
|
|
86
|
+
|
|
87
|
+
let processed_json = tokio::task::spawn_blocking(move || {
|
|
88
|
+
let result_cstring = CString::new(result_json_owned).map_err(|e| KreuzbergError::Validation {
|
|
89
|
+
message: format!("Failed to create C string from result JSON: {}", e),
|
|
90
|
+
source: Some(Box::new(e)),
|
|
91
|
+
})?;
|
|
92
|
+
|
|
93
|
+
let processed_ptr = unsafe { callback(result_cstring.as_ptr()) };
|
|
94
|
+
|
|
95
|
+
if processed_ptr.is_null() {
|
|
96
|
+
return Err(KreuzbergError::Plugin {
|
|
97
|
+
message: "PostProcessor returned NULL (operation failed)".to_string(),
|
|
98
|
+
plugin_name: processor_name.clone(),
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
let processed_cstr = unsafe { CStr::from_ptr(processed_ptr) };
|
|
103
|
+
let json = processed_cstr
|
|
104
|
+
.to_str()
|
|
105
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
106
|
+
message: format!("PostProcessor returned invalid UTF-8: {}", e),
|
|
107
|
+
plugin_name: processor_name.clone(),
|
|
108
|
+
})?
|
|
109
|
+
.to_string();
|
|
110
|
+
|
|
111
|
+
unsafe { kreuzberg_free_string(processed_ptr) };
|
|
112
|
+
|
|
113
|
+
Ok(json)
|
|
114
|
+
})
|
|
115
|
+
.await
|
|
116
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
117
|
+
message: format!("PostProcessor task panicked: {}", e),
|
|
118
|
+
plugin_name: self.name.clone(),
|
|
119
|
+
})??;
|
|
120
|
+
|
|
121
|
+
let processed_result: ExtractionResult =
|
|
122
|
+
serde_json::from_str(&processed_json).map_err(|e| KreuzbergError::Plugin {
|
|
123
|
+
message: format!("Failed to deserialize processed result: {}", e),
|
|
124
|
+
plugin_name: self.name.clone(),
|
|
125
|
+
})?;
|
|
126
|
+
|
|
127
|
+
*result = processed_result;
|
|
128
|
+
|
|
129
|
+
Ok(())
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
fn processing_stage(&self) -> kreuzberg::plugins::ProcessingStage {
|
|
133
|
+
self.stage
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
fn parse_processing_stage(stage: Option<&str>) -> std::result::Result<ProcessingStage, String> {
|
|
138
|
+
match stage {
|
|
139
|
+
Some(value) => match value.to_lowercase().as_str() {
|
|
140
|
+
"early" => Ok(ProcessingStage::Early),
|
|
141
|
+
"middle" => Ok(ProcessingStage::Middle),
|
|
142
|
+
"late" => Ok(ProcessingStage::Late),
|
|
143
|
+
other => Err(format!(
|
|
144
|
+
"Invalid processing stage '{}'. Expected one of: early, middle, late",
|
|
145
|
+
other
|
|
146
|
+
)),
|
|
147
|
+
},
|
|
148
|
+
None => Ok(ProcessingStage::Middle),
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/// Register a custom PostProcessor via FFI callback.
|
|
153
|
+
///
|
|
154
|
+
/// # Safety
|
|
155
|
+
///
|
|
156
|
+
/// - `name` must be a valid null-terminated C string
|
|
157
|
+
/// - `callback` must be a valid function pointer that:
|
|
158
|
+
/// - Does not store the result_json pointer
|
|
159
|
+
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
160
|
+
/// - The returned string must be freeable by kreuzberg_free_string
|
|
161
|
+
/// - `priority` determines the order of execution (higher priority runs first)
|
|
162
|
+
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
163
|
+
///
|
|
164
|
+
/// # Example (C)
|
|
165
|
+
///
|
|
166
|
+
/// ```c
|
|
167
|
+
/// char* my_post_processor(const char* result_json) {
|
|
168
|
+
/// // Parse result_json, modify it, return JSON string
|
|
169
|
+
/// return strdup("{\"content\":\"PROCESSED\"}");
|
|
170
|
+
/// }
|
|
171
|
+
///
|
|
172
|
+
/// bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
|
|
173
|
+
/// if (!success) {
|
|
174
|
+
/// const char* error = kreuzberg_last_error();
|
|
175
|
+
/// printf("Failed to register: %s\n", error);
|
|
176
|
+
/// }
|
|
177
|
+
/// ```
|
|
178
|
+
#[unsafe(no_mangle)]
|
|
179
|
+
pub unsafe extern "C" fn kreuzberg_register_post_processor(
|
|
180
|
+
name: *const c_char,
|
|
181
|
+
callback: PostProcessorCallback,
|
|
182
|
+
priority: i32,
|
|
183
|
+
) -> bool {
|
|
184
|
+
ffi_panic_guard_bool!("kreuzberg_register_post_processor", {
|
|
185
|
+
clear_last_error();
|
|
186
|
+
|
|
187
|
+
if name.is_null() {
|
|
188
|
+
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
189
|
+
return false;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
193
|
+
Ok(s) => s,
|
|
194
|
+
Err(e) => {
|
|
195
|
+
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
196
|
+
return false;
|
|
197
|
+
}
|
|
198
|
+
};
|
|
199
|
+
|
|
200
|
+
if name_str.is_empty() {
|
|
201
|
+
set_last_error("Plugin name cannot be empty".to_string());
|
|
202
|
+
return false;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
206
|
+
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
207
|
+
return false;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
let processor = Arc::new(FfiPostProcessor::new(
|
|
211
|
+
name_str.to_string(),
|
|
212
|
+
callback,
|
|
213
|
+
ProcessingStage::Middle,
|
|
214
|
+
));
|
|
215
|
+
|
|
216
|
+
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
217
|
+
let mut registry_guard = match registry.write() {
|
|
218
|
+
Ok(guard) => guard,
|
|
219
|
+
Err(e) => {
|
|
220
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
221
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
222
|
+
return false;
|
|
223
|
+
}
|
|
224
|
+
};
|
|
225
|
+
|
|
226
|
+
match registry_guard.register(processor, priority) {
|
|
227
|
+
Ok(()) => true,
|
|
228
|
+
Err(e) => {
|
|
229
|
+
set_last_error(format!("Failed to register PostProcessor: {}", e));
|
|
230
|
+
false
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
})
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/// Register a custom PostProcessor with an explicit processing stage.
|
|
237
|
+
///
|
|
238
|
+
/// # Safety
|
|
239
|
+
///
|
|
240
|
+
/// - `name` must be a valid null-terminated C string
|
|
241
|
+
/// - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
|
|
242
|
+
/// - `callback` must be a valid function pointer that:
|
|
243
|
+
/// - Does not store the result_json pointer
|
|
244
|
+
/// - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
245
|
+
/// - The returned string must be freeable by kreuzberg_free_string
|
|
246
|
+
/// - `priority` determines the order of execution within the stage (higher priority runs first)
|
|
247
|
+
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
248
|
+
#[unsafe(no_mangle)]
|
|
249
|
+
pub unsafe extern "C" fn kreuzberg_register_post_processor_with_stage(
|
|
250
|
+
name: *const c_char,
|
|
251
|
+
callback: PostProcessorCallback,
|
|
252
|
+
priority: i32,
|
|
253
|
+
stage: *const c_char,
|
|
254
|
+
) -> bool {
|
|
255
|
+
ffi_panic_guard_bool!("kreuzberg_register_post_processor_with_stage", {
|
|
256
|
+
clear_last_error();
|
|
257
|
+
|
|
258
|
+
if name.is_null() {
|
|
259
|
+
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
260
|
+
return false;
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
264
|
+
Ok(s) => s,
|
|
265
|
+
Err(e) => {
|
|
266
|
+
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
267
|
+
return false;
|
|
268
|
+
}
|
|
269
|
+
};
|
|
270
|
+
|
|
271
|
+
if name_str.is_empty() {
|
|
272
|
+
set_last_error("Plugin name cannot be empty".to_string());
|
|
273
|
+
return false;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
if name_str.chars().any(|c| c.is_whitespace()) {
|
|
277
|
+
set_last_error("Plugin name cannot contain whitespace".to_string());
|
|
278
|
+
return false;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
let stage_str = if stage.is_null() {
|
|
282
|
+
None
|
|
283
|
+
} else {
|
|
284
|
+
match unsafe { CStr::from_ptr(stage) }.to_str() {
|
|
285
|
+
Ok(s) => Some(s),
|
|
286
|
+
Err(e) => {
|
|
287
|
+
set_last_error(format!("Invalid UTF-8 in processing stage: {}", e));
|
|
288
|
+
return false;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
};
|
|
292
|
+
|
|
293
|
+
let stage = match parse_processing_stage(stage_str) {
|
|
294
|
+
Ok(stage) => stage,
|
|
295
|
+
Err(e) => {
|
|
296
|
+
set_last_error(e);
|
|
297
|
+
return false;
|
|
298
|
+
}
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
let processor = Arc::new(FfiPostProcessor::new(name_str.to_string(), callback, stage));
|
|
302
|
+
|
|
303
|
+
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
304
|
+
let mut registry_guard = match registry.write() {
|
|
305
|
+
Ok(guard) => guard,
|
|
306
|
+
Err(e) => {
|
|
307
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
308
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
309
|
+
return false;
|
|
310
|
+
}
|
|
311
|
+
};
|
|
312
|
+
|
|
313
|
+
match registry_guard.register(processor, priority) {
|
|
314
|
+
Ok(()) => true,
|
|
315
|
+
Err(e) => {
|
|
316
|
+
set_last_error(format!("Failed to register PostProcessor: {}", e));
|
|
317
|
+
false
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
})
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
/// Unregister a PostProcessor by name.
|
|
324
|
+
///
|
|
325
|
+
/// # Safety
|
|
326
|
+
///
|
|
327
|
+
/// - `name` must be a valid null-terminated C string
|
|
328
|
+
/// - Returns true on success, false on error (check kreuzberg_last_error)
|
|
329
|
+
///
|
|
330
|
+
/// # Example (C)
|
|
331
|
+
///
|
|
332
|
+
/// ```c
|
|
333
|
+
/// bool success = kreuzberg_unregister_post_processor("my-processor");
|
|
334
|
+
/// if (!success) {
|
|
335
|
+
/// const char* error = kreuzberg_last_error();
|
|
336
|
+
/// printf("Failed to unregister: %s\n", error);
|
|
337
|
+
/// }
|
|
338
|
+
/// ```
|
|
339
|
+
#[unsafe(no_mangle)]
|
|
340
|
+
pub unsafe extern "C" fn kreuzberg_unregister_post_processor(name: *const c_char) -> bool {
|
|
341
|
+
ffi_panic_guard_bool!("kreuzberg_unregister_post_processor", {
|
|
342
|
+
clear_last_error();
|
|
343
|
+
|
|
344
|
+
if name.is_null() {
|
|
345
|
+
set_last_error("PostProcessor name cannot be NULL".to_string());
|
|
346
|
+
return false;
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
350
|
+
Ok(s) => s,
|
|
351
|
+
Err(e) => {
|
|
352
|
+
set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
|
|
353
|
+
return false;
|
|
354
|
+
}
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
358
|
+
let mut registry_guard = match registry.write() {
|
|
359
|
+
Ok(guard) => guard,
|
|
360
|
+
Err(e) => {
|
|
361
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
362
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
363
|
+
return false;
|
|
364
|
+
}
|
|
365
|
+
};
|
|
366
|
+
|
|
367
|
+
match registry_guard.remove(name_str) {
|
|
368
|
+
Ok(()) => true,
|
|
369
|
+
Err(e) => {
|
|
370
|
+
set_last_error(format!("Failed to remove PostProcessor: {}", e));
|
|
371
|
+
false
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
})
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/// Clear all registered PostProcessors.
|
|
378
|
+
///
|
|
379
|
+
/// # Safety
|
|
380
|
+
///
|
|
381
|
+
/// - Removes all registered processors. Subsequent extractions will run without them.
|
|
382
|
+
/// - Returns true on success, false on error.
|
|
383
|
+
#[unsafe(no_mangle)]
|
|
384
|
+
pub unsafe extern "C" fn kreuzberg_clear_post_processors() -> bool {
|
|
385
|
+
ffi_panic_guard_bool!("kreuzberg_clear_post_processors", {
|
|
386
|
+
clear_last_error();
|
|
387
|
+
|
|
388
|
+
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
389
|
+
let mut registry_guard = match registry.write() {
|
|
390
|
+
Ok(guard) => guard,
|
|
391
|
+
Err(e) => {
|
|
392
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
393
|
+
set_last_error(format!("Failed to acquire registry write lock: {}", e));
|
|
394
|
+
return false;
|
|
395
|
+
}
|
|
396
|
+
};
|
|
397
|
+
|
|
398
|
+
*registry_guard = Default::default();
|
|
399
|
+
true
|
|
400
|
+
})
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
/// List all registered PostProcessors as a JSON array of names.
|
|
404
|
+
///
|
|
405
|
+
/// # Safety
|
|
406
|
+
///
|
|
407
|
+
/// - Returned string must be freed with `kreuzberg_free_string`.
|
|
408
|
+
/// - Returns NULL on error (check `kreuzberg_last_error`).
|
|
409
|
+
#[unsafe(no_mangle)]
|
|
410
|
+
pub unsafe extern "C" fn kreuzberg_list_post_processors() -> *mut c_char {
|
|
411
|
+
ffi_panic_guard!("kreuzberg_list_post_processors", {
|
|
412
|
+
clear_last_error();
|
|
413
|
+
|
|
414
|
+
let registry = kreuzberg::plugins::registry::get_post_processor_registry();
|
|
415
|
+
let registry_guard = match registry.read() {
|
|
416
|
+
Ok(guard) => guard,
|
|
417
|
+
Err(e) => {
|
|
418
|
+
// ~keep: Lock poisoning indicates a panic in another thread holding the lock.
|
|
419
|
+
set_last_error(format!("Failed to acquire registry read lock: {}", e));
|
|
420
|
+
return ptr::null_mut();
|
|
421
|
+
}
|
|
422
|
+
};
|
|
423
|
+
|
|
424
|
+
match serde_json::to_string(®istry_guard.list()) {
|
|
425
|
+
Ok(json) => match CString::new(json) {
|
|
426
|
+
Ok(cstr) => cstr.into_raw(),
|
|
427
|
+
Err(e) => {
|
|
428
|
+
set_last_error(format!("Failed to create C string: {}", e));
|
|
429
|
+
ptr::null_mut()
|
|
430
|
+
}
|
|
431
|
+
},
|
|
432
|
+
Err(e) => {
|
|
433
|
+
set_last_error(format!("Failed to serialize PostProcessor list: {}", e));
|
|
434
|
+
ptr::null_mut()
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
})
|
|
438
|
+
}
|