kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,212 +1,212 @@
|
|
|
1
|
-
//! Plugin system for extending Kreuzberg functionality.
|
|
2
|
-
//!
|
|
3
|
-
//! The plugin system provides a trait-based architecture that allows extending
|
|
4
|
-
//! Kreuzberg with custom extractors, OCR backends, post-processors, and validators.
|
|
5
|
-
//!
|
|
6
|
-
//! # Plugin Types
|
|
7
|
-
//!
|
|
8
|
-
//! - [`Plugin`] - Base trait that all plugins must implement
|
|
9
|
-
//! - [`OcrBackend`] - OCR processing plugins
|
|
10
|
-
//! - [`DocumentExtractor`] - Document format extraction plugins
|
|
11
|
-
//! - [`PostProcessor`] - Content post-processing plugins
|
|
12
|
-
//! - [`Validator`] - Validation plugins
|
|
13
|
-
//!
|
|
14
|
-
//! # Language Support
|
|
15
|
-
//!
|
|
16
|
-
//! Plugins can be implemented in:
|
|
17
|
-
//! - **Rust** (native, highest performance)
|
|
18
|
-
//! - **Python** (via PyO3 FFI bridge)
|
|
19
|
-
//! - **Node.js** (future - via napi-rs FFI bridge)
|
|
20
|
-
//!
|
|
21
|
-
//! # Lifecycle Pattern
|
|
22
|
-
//!
|
|
23
|
-
//! Plugins are stored in `Arc<dyn Trait>` for thread-safe shared access:
|
|
24
|
-
//!
|
|
25
|
-
//! ```rust
|
|
26
|
-
//! use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
27
|
-
//! use kreuzberg::plugins::registry::get_document_extractor_registry;
|
|
28
|
-
//! use std::sync::Arc;
|
|
29
|
-
//!
|
|
30
|
-
//! # struct MyExtractor;
|
|
31
|
-
//! # use kreuzberg::types::{ExtractionResult, Metadata};
|
|
32
|
-
//! # impl kreuzberg::plugins::Plugin for MyExtractor {
|
|
33
|
-
//! # fn name(&self) -> &str { "my" }
|
|
34
|
-
//! # fn version(&self) -> String { "1.0.0".to_string() }
|
|
35
|
-
//! # fn initialize(&self) -> kreuzberg::Result<()> { Ok(()) }
|
|
36
|
-
//! # fn shutdown(&self) -> kreuzberg::Result<()> { Ok(()) }
|
|
37
|
-
//! # }
|
|
38
|
-
//! # #[async_trait::async_trait]
|
|
39
|
-
//! # impl DocumentExtractor for MyExtractor {
|
|
40
|
-
//! # async fn extract_bytes(&self, _: &[u8], _: &str, _: &kreuzberg::ExtractionConfig)
|
|
41
|
-
//! # -> kreuzberg::Result<ExtractionResult> {
|
|
42
|
-
//! # Ok(ExtractionResult {
|
|
43
|
-
//! # content: String::new(),
|
|
44
|
-
//! # mime_type: String::new(),
|
|
45
|
-
//! # metadata: Metadata::default(),
|
|
46
|
-
//! # tables: vec![],
|
|
47
|
-
//! # detected_languages: None,
|
|
48
|
-
//! # chunks: None,
|
|
49
|
-
//! # images: None,
|
|
50
|
-
//! # pages: None,
|
|
51
|
-
//! # })
|
|
52
|
-
//! # }
|
|
53
|
-
//! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
|
|
54
|
-
//! # -> kreuzberg::Result<ExtractionResult> {
|
|
55
|
-
//! # Ok(ExtractionResult {
|
|
56
|
-
//! # content: String::new(),
|
|
57
|
-
//! # mime_type: String::new(),
|
|
58
|
-
//! # metadata: Metadata::default(),
|
|
59
|
-
//! # tables: vec![],
|
|
60
|
-
//! # detected_languages: None,
|
|
61
|
-
//! # chunks: None,
|
|
62
|
-
//! # images: None,
|
|
63
|
-
//! # pages: None,
|
|
64
|
-
//! # })
|
|
65
|
-
//! # }
|
|
66
|
-
//! # fn supported_mime_types(&self) -> &[&str] { &[] }
|
|
67
|
-
//! # fn priority(&self) -> i32 { 50 }
|
|
68
|
-
//! # }
|
|
69
|
-
//! // 1. Create plugin instance
|
|
70
|
-
//! let plugin = MyExtractor;
|
|
71
|
-
//!
|
|
72
|
-
//! // 2. Wrap in Arc for registration
|
|
73
|
-
//! let plugin = Arc::new(plugin);
|
|
74
|
-
//!
|
|
75
|
-
//! // 3. Register with registry (calls initialize internally)
|
|
76
|
-
//! let registry = get_document_extractor_registry();
|
|
77
|
-
//! let mut registry = registry.write().unwrap();
|
|
78
|
-
//! registry.register(plugin)?;
|
|
79
|
-
//! # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
80
|
-
//! ```
|
|
81
|
-
//!
|
|
82
|
-
//! # Example: Custom Document Extractor
|
|
83
|
-
//!
|
|
84
|
-
//! ```rust
|
|
85
|
-
//! use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
86
|
-
//! use kreuzberg::{Result, ExtractionConfig};
|
|
87
|
-
//! use kreuzberg::types::{ExtractionResult, Metadata};
|
|
88
|
-
//! use async_trait::async_trait;
|
|
89
|
-
//! use std::path::Path;
|
|
90
|
-
//!
|
|
91
|
-
//! struct CustomJsonExtractor;
|
|
92
|
-
//!
|
|
93
|
-
//! impl Plugin for CustomJsonExtractor {
|
|
94
|
-
//! fn name(&self) -> &str { "custom-json-extractor" }
|
|
95
|
-
//! fn version(&self) -> String { "1.0.0".to_string() }
|
|
96
|
-
//! fn initialize(&self) -> Result<()> {
|
|
97
|
-
//! println!("JSON extractor initialized");
|
|
98
|
-
//! Ok(())
|
|
99
|
-
//! }
|
|
100
|
-
//! fn shutdown(&self) -> Result<()> {
|
|
101
|
-
//! println!("JSON extractor shutdown");
|
|
102
|
-
//! Ok(())
|
|
103
|
-
//! }
|
|
104
|
-
//! }
|
|
105
|
-
//!
|
|
106
|
-
//! #[async_trait]
|
|
107
|
-
//! impl DocumentExtractor for CustomJsonExtractor {
|
|
108
|
-
//! async fn extract_bytes(&self, content: &[u8], _mime_type: &str, _config: &ExtractionConfig)
|
|
109
|
-
//! -> Result<ExtractionResult> {
|
|
110
|
-
//! // Parse JSON and extract all string values
|
|
111
|
-
//! let json: serde_json::Value = serde_json::from_slice(content)?;
|
|
112
|
-
//! let extracted_text = extract_strings_from_json(&json);
|
|
113
|
-
//!
|
|
114
|
-
//! let mut metadata = Metadata::default();
|
|
115
|
-
//! metadata.additional.insert("extracted_fields".to_string(), serde_json::json!(true));
|
|
116
|
-
//!
|
|
117
|
-
//! Ok(ExtractionResult {
|
|
118
|
-
//! content: extracted_text,
|
|
119
|
-
//! mime_type: "application/json".to_string(),
|
|
120
|
-
//! metadata,
|
|
121
|
-
//! tables: vec![],
|
|
122
|
-
//! detected_languages: None,
|
|
123
|
-
//! chunks: None,
|
|
124
|
-
//! images: None,
|
|
125
|
-
//! pages: None,
|
|
126
|
-
//! })
|
|
127
|
-
//! }
|
|
128
|
-
//!
|
|
129
|
-
//! async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
|
|
130
|
-
//! -> Result<ExtractionResult> {
|
|
131
|
-
//! // Read file and delegate to extract_bytes
|
|
132
|
-
//! let content = tokio::fs::read(path).await?;
|
|
133
|
-
//! self.extract_bytes(&content, mime_type, config).await
|
|
134
|
-
//! }
|
|
135
|
-
//!
|
|
136
|
-
//! fn supported_mime_types(&self) -> &[&str] {
|
|
137
|
-
//! &["application/json", "text/json"]
|
|
138
|
-
//! }
|
|
139
|
-
//!
|
|
140
|
-
//! fn priority(&self) -> i32 { 50 } // Default priority
|
|
141
|
-
//! }
|
|
142
|
-
//!
|
|
143
|
-
//! fn extract_strings_from_json(value: &serde_json::Value) -> String {
|
|
144
|
-
//! match value {
|
|
145
|
-
//! serde_json::Value::String(s) => format!("{}\n", s),
|
|
146
|
-
//! serde_json::Value::Array(arr) => {
|
|
147
|
-
//! arr.iter().map(extract_strings_from_json).collect()
|
|
148
|
-
//! }
|
|
149
|
-
//! serde_json::Value::Object(obj) => {
|
|
150
|
-
//! obj.values().map(extract_strings_from_json).collect()
|
|
151
|
-
//! }
|
|
152
|
-
//! _ => String::new(),
|
|
153
|
-
//! }
|
|
154
|
-
//! }
|
|
155
|
-
//! ```
|
|
156
|
-
//!
|
|
157
|
-
//! # Safety and Threading
|
|
158
|
-
//!
|
|
159
|
-
//! **CRITICAL**: All plugins must be `Send + Sync` because they are:
|
|
160
|
-
//! - Stored in `Arc<dyn Trait>` for shared ownership
|
|
161
|
-
//! - Accessed concurrently from multiple threads
|
|
162
|
-
//! - Called with `&self` (shared references)
|
|
163
|
-
//!
|
|
164
|
-
//! **Interior Mutability Pattern**:
|
|
165
|
-
//! Since plugins receive `&self` (not `&mut self`), use these for mutable state:
|
|
166
|
-
//! - `Mutex<T>` - Exclusive access, blocking
|
|
167
|
-
//! - `RwLock<T>` - Shared read, exclusive write
|
|
168
|
-
//! - `AtomicBool` / `AtomicU64` - Lock-free primitives
|
|
169
|
-
//! - `OnceCell<T>` - One-time initialization
|
|
170
|
-
//!
|
|
171
|
-
//! ```rust
|
|
172
|
-
//! use kreuzberg::plugins::Plugin;
|
|
173
|
-
//! use std::sync::Mutex;
|
|
174
|
-
//!
|
|
175
|
-
//! struct StatefulPlugin {
|
|
176
|
-
//! // Use interior mutability for state
|
|
177
|
-
//! call_count: std::sync::atomic::AtomicU64,
|
|
178
|
-
//! cache: Mutex<Option<Vec<String>>>,
|
|
179
|
-
//! }
|
|
180
|
-
//!
|
|
181
|
-
//! impl Plugin for StatefulPlugin {
|
|
182
|
-
//! fn name(&self) -> &str { "stateful-plugin" }
|
|
183
|
-
//! fn version(&self) -> String { "1.0.0".to_string() }
|
|
184
|
-
//!
|
|
185
|
-
//! fn initialize(&self) -> kreuzberg::Result<()> {
|
|
186
|
-
//! // Modify through interior mutability
|
|
187
|
-
//! let mut cache = self.cache.lock().unwrap();
|
|
188
|
-
//! *cache = Some(vec!["initialized".to_string()]);
|
|
189
|
-
//! Ok(())
|
|
190
|
-
//! }
|
|
191
|
-
//!
|
|
192
|
-
//! fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
193
|
-
//! self.call_count.store(0, std::sync::atomic::Ordering::Release);
|
|
194
|
-
//! Ok(())
|
|
195
|
-
//! }
|
|
196
|
-
//! }
|
|
197
|
-
//! ```
|
|
198
|
-
|
|
199
|
-
mod extractor;
|
|
200
|
-
mod ocr;
|
|
201
|
-
mod processor;
|
|
202
|
-
pub mod registry;
|
|
203
|
-
mod traits;
|
|
204
|
-
mod validator;
|
|
205
|
-
|
|
206
|
-
pub use extractor::{DocumentExtractor, clear_extractors, list_extractors, register_extractor, unregister_extractor};
|
|
207
|
-
pub use ocr::{
|
|
208
|
-
OcrBackend, OcrBackendType, clear_ocr_backends, list_ocr_backends, register_ocr_backend, unregister_ocr_backend,
|
|
209
|
-
};
|
|
210
|
-
pub use processor::{PostProcessor, ProcessingStage, list_post_processors};
|
|
211
|
-
pub use traits::Plugin;
|
|
212
|
-
pub use validator::{Validator, clear_validators, list_validators, register_validator, unregister_validator};
|
|
1
|
+
//! Plugin system for extending Kreuzberg functionality.
|
|
2
|
+
//!
|
|
3
|
+
//! The plugin system provides a trait-based architecture that allows extending
|
|
4
|
+
//! Kreuzberg with custom extractors, OCR backends, post-processors, and validators.
|
|
5
|
+
//!
|
|
6
|
+
//! # Plugin Types
|
|
7
|
+
//!
|
|
8
|
+
//! - [`Plugin`] - Base trait that all plugins must implement
|
|
9
|
+
//! - [`OcrBackend`] - OCR processing plugins
|
|
10
|
+
//! - [`DocumentExtractor`] - Document format extraction plugins
|
|
11
|
+
//! - [`PostProcessor`] - Content post-processing plugins
|
|
12
|
+
//! - [`Validator`] - Validation plugins
|
|
13
|
+
//!
|
|
14
|
+
//! # Language Support
|
|
15
|
+
//!
|
|
16
|
+
//! Plugins can be implemented in:
|
|
17
|
+
//! - **Rust** (native, highest performance)
|
|
18
|
+
//! - **Python** (via PyO3 FFI bridge)
|
|
19
|
+
//! - **Node.js** (future - via napi-rs FFI bridge)
|
|
20
|
+
//!
|
|
21
|
+
//! # Lifecycle Pattern
|
|
22
|
+
//!
|
|
23
|
+
//! Plugins are stored in `Arc<dyn Trait>` for thread-safe shared access:
|
|
24
|
+
//!
|
|
25
|
+
//! ```rust
|
|
26
|
+
//! use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
27
|
+
//! use kreuzberg::plugins::registry::get_document_extractor_registry;
|
|
28
|
+
//! use std::sync::Arc;
|
|
29
|
+
//!
|
|
30
|
+
//! # struct MyExtractor;
|
|
31
|
+
//! # use kreuzberg::types::{ExtractionResult, Metadata};
|
|
32
|
+
//! # impl kreuzberg::plugins::Plugin for MyExtractor {
|
|
33
|
+
//! # fn name(&self) -> &str { "my" }
|
|
34
|
+
//! # fn version(&self) -> String { "1.0.0".to_string() }
|
|
35
|
+
//! # fn initialize(&self) -> kreuzberg::Result<()> { Ok(()) }
|
|
36
|
+
//! # fn shutdown(&self) -> kreuzberg::Result<()> { Ok(()) }
|
|
37
|
+
//! # }
|
|
38
|
+
//! # #[async_trait::async_trait]
|
|
39
|
+
//! # impl DocumentExtractor for MyExtractor {
|
|
40
|
+
//! # async fn extract_bytes(&self, _: &[u8], _: &str, _: &kreuzberg::ExtractionConfig)
|
|
41
|
+
//! # -> kreuzberg::Result<ExtractionResult> {
|
|
42
|
+
//! # Ok(ExtractionResult {
|
|
43
|
+
//! # content: String::new(),
|
|
44
|
+
//! # mime_type: String::new(),
|
|
45
|
+
//! # metadata: Metadata::default(),
|
|
46
|
+
//! # tables: vec![],
|
|
47
|
+
//! # detected_languages: None,
|
|
48
|
+
//! # chunks: None,
|
|
49
|
+
//! # images: None,
|
|
50
|
+
//! # pages: None,
|
|
51
|
+
//! # })
|
|
52
|
+
//! # }
|
|
53
|
+
//! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
|
|
54
|
+
//! # -> kreuzberg::Result<ExtractionResult> {
|
|
55
|
+
//! # Ok(ExtractionResult {
|
|
56
|
+
//! # content: String::new(),
|
|
57
|
+
//! # mime_type: String::new(),
|
|
58
|
+
//! # metadata: Metadata::default(),
|
|
59
|
+
//! # tables: vec![],
|
|
60
|
+
//! # detected_languages: None,
|
|
61
|
+
//! # chunks: None,
|
|
62
|
+
//! # images: None,
|
|
63
|
+
//! # pages: None,
|
|
64
|
+
//! # })
|
|
65
|
+
//! # }
|
|
66
|
+
//! # fn supported_mime_types(&self) -> &[&str] { &[] }
|
|
67
|
+
//! # fn priority(&self) -> i32 { 50 }
|
|
68
|
+
//! # }
|
|
69
|
+
//! // 1. Create plugin instance
|
|
70
|
+
//! let plugin = MyExtractor;
|
|
71
|
+
//!
|
|
72
|
+
//! // 2. Wrap in Arc for registration
|
|
73
|
+
//! let plugin = Arc::new(plugin);
|
|
74
|
+
//!
|
|
75
|
+
//! // 3. Register with registry (calls initialize internally)
|
|
76
|
+
//! let registry = get_document_extractor_registry();
|
|
77
|
+
//! let mut registry = registry.write().unwrap();
|
|
78
|
+
//! registry.register(plugin)?;
|
|
79
|
+
//! # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
80
|
+
//! ```
|
|
81
|
+
//!
|
|
82
|
+
//! # Example: Custom Document Extractor
|
|
83
|
+
//!
|
|
84
|
+
//! ```rust
|
|
85
|
+
//! use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
86
|
+
//! use kreuzberg::{Result, ExtractionConfig};
|
|
87
|
+
//! use kreuzberg::types::{ExtractionResult, Metadata};
|
|
88
|
+
//! use async_trait::async_trait;
|
|
89
|
+
//! use std::path::Path;
|
|
90
|
+
//!
|
|
91
|
+
//! struct CustomJsonExtractor;
|
|
92
|
+
//!
|
|
93
|
+
//! impl Plugin for CustomJsonExtractor {
|
|
94
|
+
//! fn name(&self) -> &str { "custom-json-extractor" }
|
|
95
|
+
//! fn version(&self) -> String { "1.0.0".to_string() }
|
|
96
|
+
//! fn initialize(&self) -> Result<()> {
|
|
97
|
+
//! println!("JSON extractor initialized");
|
|
98
|
+
//! Ok(())
|
|
99
|
+
//! }
|
|
100
|
+
//! fn shutdown(&self) -> Result<()> {
|
|
101
|
+
//! println!("JSON extractor shutdown");
|
|
102
|
+
//! Ok(())
|
|
103
|
+
//! }
|
|
104
|
+
//! }
|
|
105
|
+
//!
|
|
106
|
+
//! #[async_trait]
|
|
107
|
+
//! impl DocumentExtractor for CustomJsonExtractor {
|
|
108
|
+
//! async fn extract_bytes(&self, content: &[u8], _mime_type: &str, _config: &ExtractionConfig)
|
|
109
|
+
//! -> Result<ExtractionResult> {
|
|
110
|
+
//! // Parse JSON and extract all string values
|
|
111
|
+
//! let json: serde_json::Value = serde_json::from_slice(content)?;
|
|
112
|
+
//! let extracted_text = extract_strings_from_json(&json);
|
|
113
|
+
//!
|
|
114
|
+
//! let mut metadata = Metadata::default();
|
|
115
|
+
//! metadata.additional.insert("extracted_fields".to_string(), serde_json::json!(true));
|
|
116
|
+
//!
|
|
117
|
+
//! Ok(ExtractionResult {
|
|
118
|
+
//! content: extracted_text,
|
|
119
|
+
//! mime_type: "application/json".to_string(),
|
|
120
|
+
//! metadata,
|
|
121
|
+
//! tables: vec![],
|
|
122
|
+
//! detected_languages: None,
|
|
123
|
+
//! chunks: None,
|
|
124
|
+
//! images: None,
|
|
125
|
+
//! pages: None,
|
|
126
|
+
//! })
|
|
127
|
+
//! }
|
|
128
|
+
//!
|
|
129
|
+
//! async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
|
|
130
|
+
//! -> Result<ExtractionResult> {
|
|
131
|
+
//! // Read file and delegate to extract_bytes
|
|
132
|
+
//! let content = tokio::fs::read(path).await?;
|
|
133
|
+
//! self.extract_bytes(&content, mime_type, config).await
|
|
134
|
+
//! }
|
|
135
|
+
//!
|
|
136
|
+
//! fn supported_mime_types(&self) -> &[&str] {
|
|
137
|
+
//! &["application/json", "text/json"]
|
|
138
|
+
//! }
|
|
139
|
+
//!
|
|
140
|
+
//! fn priority(&self) -> i32 { 50 } // Default priority
|
|
141
|
+
//! }
|
|
142
|
+
//!
|
|
143
|
+
//! fn extract_strings_from_json(value: &serde_json::Value) -> String {
|
|
144
|
+
//! match value {
|
|
145
|
+
//! serde_json::Value::String(s) => format!("{}\n", s),
|
|
146
|
+
//! serde_json::Value::Array(arr) => {
|
|
147
|
+
//! arr.iter().map(extract_strings_from_json).collect()
|
|
148
|
+
//! }
|
|
149
|
+
//! serde_json::Value::Object(obj) => {
|
|
150
|
+
//! obj.values().map(extract_strings_from_json).collect()
|
|
151
|
+
//! }
|
|
152
|
+
//! _ => String::new(),
|
|
153
|
+
//! }
|
|
154
|
+
//! }
|
|
155
|
+
//! ```
|
|
156
|
+
//!
|
|
157
|
+
//! # Safety and Threading
|
|
158
|
+
//!
|
|
159
|
+
//! **CRITICAL**: All plugins must be `Send + Sync` because they are:
|
|
160
|
+
//! - Stored in `Arc<dyn Trait>` for shared ownership
|
|
161
|
+
//! - Accessed concurrently from multiple threads
|
|
162
|
+
//! - Called with `&self` (shared references)
|
|
163
|
+
//!
|
|
164
|
+
//! **Interior Mutability Pattern**:
|
|
165
|
+
//! Since plugins receive `&self` (not `&mut self`), use these for mutable state:
|
|
166
|
+
//! - `Mutex<T>` - Exclusive access, blocking
|
|
167
|
+
//! - `RwLock<T>` - Shared read, exclusive write
|
|
168
|
+
//! - `AtomicBool` / `AtomicU64` - Lock-free primitives
|
|
169
|
+
//! - `OnceLock<T>` (or `once_cell::sync::OnceCell<T>`) - One-time initialization; note that `std::cell::OnceCell<T>` is not `Sync`
|
|
170
|
+
//!
|
|
171
|
+
//! ```rust
|
|
172
|
+
//! use kreuzberg::plugins::Plugin;
|
|
173
|
+
//! use std::sync::Mutex;
|
|
174
|
+
//!
|
|
175
|
+
//! struct StatefulPlugin {
|
|
176
|
+
//! // Use interior mutability for state
|
|
177
|
+
//! call_count: std::sync::atomic::AtomicU64,
|
|
178
|
+
//! cache: Mutex<Option<Vec<String>>>,
|
|
179
|
+
//! }
|
|
180
|
+
//!
|
|
181
|
+
//! impl Plugin for StatefulPlugin {
|
|
182
|
+
//! fn name(&self) -> &str { "stateful-plugin" }
|
|
183
|
+
//! fn version(&self) -> String { "1.0.0".to_string() }
|
|
184
|
+
//!
|
|
185
|
+
//! fn initialize(&self) -> kreuzberg::Result<()> {
|
|
186
|
+
//! // Modify through interior mutability
|
|
187
|
+
//! let mut cache = self.cache.lock().unwrap();
|
|
188
|
+
//! *cache = Some(vec!["initialized".to_string()]);
|
|
189
|
+
//! Ok(())
|
|
190
|
+
//! }
|
|
191
|
+
//!
|
|
192
|
+
//! fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
193
|
+
//! self.call_count.store(0, std::sync::atomic::Ordering::Release);
|
|
194
|
+
//! Ok(())
|
|
195
|
+
//! }
|
|
196
|
+
//! }
|
|
197
|
+
//! ```
|
|
198
|
+
|
|
199
|
+
mod extractor;
|
|
200
|
+
mod ocr;
|
|
201
|
+
mod processor;
|
|
202
|
+
pub mod registry;
|
|
203
|
+
mod traits;
|
|
204
|
+
mod validator;
|
|
205
|
+
|
|
206
|
+
pub use extractor::{DocumentExtractor, clear_extractors, list_extractors, register_extractor, unregister_extractor};
|
|
207
|
+
pub use ocr::{
|
|
208
|
+
OcrBackend, OcrBackendType, clear_ocr_backends, list_ocr_backends, register_ocr_backend, unregister_ocr_backend,
|
|
209
|
+
};
|
|
210
|
+
pub use processor::{PostProcessor, ProcessingStage, list_post_processors};
|
|
211
|
+
pub use traits::Plugin;
|
|
212
|
+
pub use validator::{Validator, clear_validators, list_validators, register_validator, unregister_validator};
|