kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,560 +1,560 @@
|
|
|
1
|
-
//! Comprehensive post-processor plugin system tests.
|
|
2
|
-
//!
|
|
3
|
-
//! Tests custom post-processor registration, execution, modifications,
|
|
4
|
-
//! error handling, and cleanup with real file extraction.
|
|
5
|
-
|
|
6
|
-
use async_trait::async_trait;
|
|
7
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
8
|
-
use kreuzberg::plugins::registry::get_post_processor_registry;
|
|
9
|
-
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
10
|
-
use kreuzberg::types::ExtractionResult;
|
|
11
|
-
use kreuzberg::{KreuzbergError, Result, extract_file_sync};
|
|
12
|
-
use serial_test::serial;
|
|
13
|
-
use std::sync::Arc;
|
|
14
|
-
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
|
15
|
-
|
|
16
|
-
struct AppendTextProcessor {
|
|
17
|
-
name: String,
|
|
18
|
-
text_to_append: String,
|
|
19
|
-
call_count: AtomicUsize,
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
impl Plugin for AppendTextProcessor {
|
|
23
|
-
fn name(&self) -> &str {
|
|
24
|
-
&self.name
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
fn version(&self) -> String {
|
|
28
|
-
"1.0.0".to_string()
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
fn initialize(&self) -> Result<()> {
|
|
32
|
-
Ok(())
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
fn shutdown(&self) -> Result<()> {
|
|
36
|
-
Ok(())
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
#[async_trait]
|
|
41
|
-
impl PostProcessor for AppendTextProcessor {
|
|
42
|
-
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
43
|
-
self.call_count.fetch_add(1, Ordering::SeqCst);
|
|
44
|
-
result.content.push_str(&self.text_to_append);
|
|
45
|
-
Ok(())
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
49
|
-
ProcessingStage::Late
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
struct MetadataAddingProcessor {
|
|
54
|
-
name: String,
|
|
55
|
-
initialized: AtomicBool,
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
impl Plugin for MetadataAddingProcessor {
|
|
59
|
-
fn name(&self) -> &str {
|
|
60
|
-
&self.name
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
fn version(&self) -> String {
|
|
64
|
-
"1.0.0".to_string()
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
fn initialize(&self) -> Result<()> {
|
|
68
|
-
self.initialized.store(true, Ordering::Release);
|
|
69
|
-
Ok(())
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
fn shutdown(&self) -> Result<()> {
|
|
73
|
-
self.initialized.store(false, Ordering::Release);
|
|
74
|
-
Ok(())
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
#[async_trait]
|
|
79
|
-
impl PostProcessor for MetadataAddingProcessor {
|
|
80
|
-
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
81
|
-
result
|
|
82
|
-
.metadata
|
|
83
|
-
.additional
|
|
84
|
-
.insert("processed_by".to_string(), serde_json::json!(self.name()));
|
|
85
|
-
result.metadata.additional.insert(
|
|
86
|
-
"word_count".to_string(),
|
|
87
|
-
serde_json::json!(result.content.split_whitespace().count()),
|
|
88
|
-
);
|
|
89
|
-
Ok(())
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
93
|
-
ProcessingStage::Early
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
struct UppercaseProcessor {
|
|
98
|
-
name: String,
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
impl Plugin for UppercaseProcessor {
|
|
102
|
-
fn name(&self) -> &str {
|
|
103
|
-
&self.name
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
fn version(&self) -> String {
|
|
107
|
-
"1.0.0".to_string()
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
fn initialize(&self) -> Result<()> {
|
|
111
|
-
Ok(())
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
fn shutdown(&self) -> Result<()> {
|
|
115
|
-
Ok(())
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
#[async_trait]
|
|
120
|
-
impl PostProcessor for UppercaseProcessor {
|
|
121
|
-
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
122
|
-
result.content = result.content.to_uppercase();
|
|
123
|
-
Ok(())
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
127
|
-
ProcessingStage::Middle
|
|
128
|
-
}
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
struct FailingProcessor {
|
|
132
|
-
name: String,
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
impl Plugin for FailingProcessor {
|
|
136
|
-
fn name(&self) -> &str {
|
|
137
|
-
&self.name
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
fn version(&self) -> String {
|
|
141
|
-
"1.0.0".to_string()
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
fn initialize(&self) -> Result<()> {
|
|
145
|
-
Ok(())
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
fn shutdown(&self) -> Result<()> {
|
|
149
|
-
Ok(())
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#[async_trait]
|
|
154
|
-
impl PostProcessor for FailingProcessor {
|
|
155
|
-
async fn process(&self, _result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
156
|
-
Err(KreuzbergError::Plugin {
|
|
157
|
-
message: "Processor intentionally failed".to_string(),
|
|
158
|
-
plugin_name: self.name.clone(),
|
|
159
|
-
})
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
163
|
-
ProcessingStage::Early
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
#[serial]
|
|
168
|
-
#[test]
|
|
169
|
-
fn test_register_custom_postprocessor() {
|
|
170
|
-
let registry = get_post_processor_registry();
|
|
171
|
-
|
|
172
|
-
let processor = Arc::new(AppendTextProcessor {
|
|
173
|
-
name: "test-appender".to_string(),
|
|
174
|
-
text_to_append: " [PROCESSED]".to_string(),
|
|
175
|
-
call_count: AtomicUsize::new(0),
|
|
176
|
-
});
|
|
177
|
-
|
|
178
|
-
{
|
|
179
|
-
let mut reg = registry.write().unwrap();
|
|
180
|
-
reg.shutdown_all().unwrap();
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
{
|
|
184
|
-
let mut reg = registry.write().unwrap();
|
|
185
|
-
let result = reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100);
|
|
186
|
-
assert!(result.is_ok(), "Failed to register processor: {:?}", result.err());
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
let list = {
|
|
190
|
-
let reg = registry.read().unwrap();
|
|
191
|
-
reg.list()
|
|
192
|
-
};
|
|
193
|
-
|
|
194
|
-
assert!(list.contains(&"test-appender".to_string()));
|
|
195
|
-
|
|
196
|
-
{
|
|
197
|
-
let mut reg = registry.write().unwrap();
|
|
198
|
-
reg.shutdown_all().unwrap();
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
#[serial]
|
|
203
|
-
#[test]
|
|
204
|
-
fn test_postprocessor_called_during_extraction() {
|
|
205
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
206
|
-
let registry = get_post_processor_registry();
|
|
207
|
-
|
|
208
|
-
let processor = Arc::new(AppendTextProcessor {
|
|
209
|
-
name: "call-test-appender".to_string(),
|
|
210
|
-
text_to_append: "\n[APPENDED BY PROCESSOR]".to_string(),
|
|
211
|
-
call_count: AtomicUsize::new(0),
|
|
212
|
-
});
|
|
213
|
-
|
|
214
|
-
{
|
|
215
|
-
let mut reg = registry.write().unwrap();
|
|
216
|
-
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
217
|
-
.unwrap();
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
let config = ExtractionConfig::default();
|
|
221
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
222
|
-
|
|
223
|
-
assert!(result.is_ok(), "Extraction failed: {:?}", result.err());
|
|
224
|
-
|
|
225
|
-
let extraction_result = result.unwrap();
|
|
226
|
-
assert!(
|
|
227
|
-
extraction_result.content.contains("[APPENDED BY PROCESSOR]"),
|
|
228
|
-
"Processor did not modify content. Content: {}",
|
|
229
|
-
extraction_result.content
|
|
230
|
-
);
|
|
231
|
-
|
|
232
|
-
assert_eq!(
|
|
233
|
-
processor.call_count.load(Ordering::SeqCst),
|
|
234
|
-
1,
|
|
235
|
-
"Processor was not called exactly once"
|
|
236
|
-
);
|
|
237
|
-
|
|
238
|
-
{
|
|
239
|
-
let mut reg = registry.write().unwrap();
|
|
240
|
-
reg.shutdown_all().unwrap();
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
#[serial]
|
|
245
|
-
#[test]
|
|
246
|
-
fn test_postprocessor_modifies_content() {
|
|
247
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
248
|
-
let registry = get_post_processor_registry();
|
|
249
|
-
|
|
250
|
-
let processor = Arc::new(UppercaseProcessor {
|
|
251
|
-
name: "uppercase-processor".to_string(),
|
|
252
|
-
});
|
|
253
|
-
|
|
254
|
-
{
|
|
255
|
-
let mut reg = registry.write().unwrap();
|
|
256
|
-
reg.register(processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
let config = ExtractionConfig::default();
|
|
260
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
261
|
-
|
|
262
|
-
assert!(result.is_ok());
|
|
263
|
-
|
|
264
|
-
let extraction_result = result.unwrap();
|
|
265
|
-
let has_lowercase = extraction_result.content.chars().any(|c| c.is_lowercase());
|
|
266
|
-
|
|
267
|
-
assert!(!has_lowercase, "Content was not fully uppercased");
|
|
268
|
-
|
|
269
|
-
{
|
|
270
|
-
let mut reg = registry.write().unwrap();
|
|
271
|
-
reg.shutdown_all().unwrap();
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
#[serial]
|
|
276
|
-
#[test]
|
|
277
|
-
fn test_postprocessor_adds_metadata() {
|
|
278
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
279
|
-
let registry = get_post_processor_registry();
|
|
280
|
-
|
|
281
|
-
let processor = Arc::new(MetadataAddingProcessor {
|
|
282
|
-
name: "metadata-adder".to_string(),
|
|
283
|
-
initialized: AtomicBool::new(false),
|
|
284
|
-
});
|
|
285
|
-
|
|
286
|
-
{
|
|
287
|
-
let mut reg = registry.write().unwrap();
|
|
288
|
-
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
289
|
-
.unwrap();
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
assert!(
|
|
293
|
-
processor.initialized.load(Ordering::Acquire),
|
|
294
|
-
"Processor was not initialized"
|
|
295
|
-
);
|
|
296
|
-
|
|
297
|
-
let config = ExtractionConfig::default();
|
|
298
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
299
|
-
|
|
300
|
-
assert!(result.is_ok());
|
|
301
|
-
|
|
302
|
-
let extraction_result = result.unwrap();
|
|
303
|
-
|
|
304
|
-
assert!(
|
|
305
|
-
extraction_result.metadata.additional.contains_key("processed_by"),
|
|
306
|
-
"Metadata 'processed_by' not added"
|
|
307
|
-
);
|
|
308
|
-
assert!(
|
|
309
|
-
extraction_result.metadata.additional.contains_key("word_count"),
|
|
310
|
-
"Metadata 'word_count' not added"
|
|
311
|
-
);
|
|
312
|
-
|
|
313
|
-
let processed_by = extraction_result.metadata.additional.get("processed_by").unwrap();
|
|
314
|
-
assert_eq!(processed_by.as_str().unwrap(), "metadata-adder");
|
|
315
|
-
|
|
316
|
-
{
|
|
317
|
-
let mut reg = registry.write().unwrap();
|
|
318
|
-
reg.shutdown_all().unwrap();
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
assert!(
|
|
322
|
-
!processor.initialized.load(Ordering::Acquire),
|
|
323
|
-
"Processor was not shutdown"
|
|
324
|
-
);
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
#[serial]
|
|
328
|
-
#[test]
|
|
329
|
-
fn test_unregister_postprocessor() {
|
|
330
|
-
let registry = get_post_processor_registry();
|
|
331
|
-
|
|
332
|
-
let processor = Arc::new(AppendTextProcessor {
|
|
333
|
-
name: "unregister-test".to_string(),
|
|
334
|
-
text_to_append: " [SHOULD NOT APPEAR]".to_string(),
|
|
335
|
-
call_count: AtomicUsize::new(0),
|
|
336
|
-
});
|
|
337
|
-
|
|
338
|
-
{
|
|
339
|
-
let mut reg = registry.write().unwrap();
|
|
340
|
-
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
341
|
-
.unwrap();
|
|
342
|
-
}
|
|
343
|
-
|
|
344
|
-
{
|
|
345
|
-
let mut reg = registry.write().unwrap();
|
|
346
|
-
reg.remove("unregister-test").unwrap();
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
let list = {
|
|
350
|
-
let reg = registry.read().unwrap();
|
|
351
|
-
reg.list()
|
|
352
|
-
};
|
|
353
|
-
|
|
354
|
-
assert!(!list.contains(&"unregister-test".to_string()));
|
|
355
|
-
|
|
356
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
357
|
-
let config = ExtractionConfig::default();
|
|
358
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
359
|
-
|
|
360
|
-
assert!(result.is_ok());
|
|
361
|
-
|
|
362
|
-
let extraction_result = result.unwrap();
|
|
363
|
-
assert!(
|
|
364
|
-
!extraction_result.content.contains("[SHOULD NOT APPEAR]"),
|
|
365
|
-
"Unregistered processor still modified content"
|
|
366
|
-
);
|
|
367
|
-
|
|
368
|
-
assert_eq!(processor.call_count.load(Ordering::SeqCst), 0);
|
|
369
|
-
|
|
370
|
-
{
|
|
371
|
-
let mut reg = registry.write().unwrap();
|
|
372
|
-
reg.shutdown_all().unwrap();
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
|
|
376
|
-
#[serial]
|
|
377
|
-
#[test]
|
|
378
|
-
fn test_clear_all_postprocessors() {
|
|
379
|
-
let registry = get_post_processor_registry();
|
|
380
|
-
|
|
381
|
-
{
|
|
382
|
-
let mut reg = registry.write().unwrap();
|
|
383
|
-
reg.shutdown_all().unwrap();
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
let processor1 = Arc::new(AppendTextProcessor {
|
|
387
|
-
name: "clear-test-1".to_string(),
|
|
388
|
-
text_to_append: " [ONE]".to_string(),
|
|
389
|
-
call_count: AtomicUsize::new(0),
|
|
390
|
-
});
|
|
391
|
-
|
|
392
|
-
let processor2 = Arc::new(AppendTextProcessor {
|
|
393
|
-
name: "clear-test-2".to_string(),
|
|
394
|
-
text_to_append: " [TWO]".to_string(),
|
|
395
|
-
call_count: AtomicUsize::new(0),
|
|
396
|
-
});
|
|
397
|
-
|
|
398
|
-
{
|
|
399
|
-
let mut reg = registry.write().unwrap();
|
|
400
|
-
reg.register(processor1 as Arc<dyn PostProcessor>, 100).unwrap();
|
|
401
|
-
reg.register(processor2 as Arc<dyn PostProcessor>, 100).unwrap();
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
{
|
|
405
|
-
let mut reg = registry.write().unwrap();
|
|
406
|
-
reg.shutdown_all().unwrap();
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
let list = {
|
|
410
|
-
let reg = registry.read().unwrap();
|
|
411
|
-
reg.list()
|
|
412
|
-
};
|
|
413
|
-
|
|
414
|
-
assert!(list.is_empty(), "Registry was not cleared");
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
#[serial]
|
|
418
|
-
#[test]
|
|
419
|
-
fn test_postprocessor_error_handling() {
|
|
420
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
421
|
-
let registry = get_post_processor_registry();
|
|
422
|
-
|
|
423
|
-
let failing_processor = Arc::new(FailingProcessor {
|
|
424
|
-
name: "failing-processor".to_string(),
|
|
425
|
-
});
|
|
426
|
-
|
|
427
|
-
{
|
|
428
|
-
let mut reg = registry.write().unwrap();
|
|
429
|
-
reg.register(failing_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
430
|
-
}
|
|
431
|
-
|
|
432
|
-
let config = ExtractionConfig::default();
|
|
433
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
434
|
-
|
|
435
|
-
// NOTE: Plugin errors now bubble up and fail the extraction (design change)
|
|
436
|
-
assert!(
|
|
437
|
-
result.is_err(),
|
|
438
|
-
"Extraction should fail when postprocessor returns Plugin error"
|
|
439
|
-
);
|
|
440
|
-
|
|
441
|
-
match result {
|
|
442
|
-
Err(KreuzbergError::Plugin { message, plugin_name }) => {
|
|
443
|
-
assert_eq!(plugin_name, "failing-processor");
|
|
444
|
-
assert_eq!(message, "Processor intentionally failed");
|
|
445
|
-
}
|
|
446
|
-
_ => panic!("Expected Plugin error"),
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
{
|
|
450
|
-
let mut reg = registry.write().unwrap();
|
|
451
|
-
reg.shutdown_all().unwrap();
|
|
452
|
-
}
|
|
453
|
-
}
|
|
454
|
-
|
|
455
|
-
#[serial]
|
|
456
|
-
#[test]
|
|
457
|
-
fn test_postprocessor_invalid_name() {
|
|
458
|
-
let registry = get_post_processor_registry();
|
|
459
|
-
|
|
460
|
-
{
|
|
461
|
-
let mut reg = registry.write().unwrap();
|
|
462
|
-
reg.shutdown_all().unwrap();
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
let processor = Arc::new(AppendTextProcessor {
|
|
466
|
-
name: "invalid name".to_string(),
|
|
467
|
-
text_to_append: " [TEST]".to_string(),
|
|
468
|
-
call_count: AtomicUsize::new(0),
|
|
469
|
-
});
|
|
470
|
-
|
|
471
|
-
{
|
|
472
|
-
let mut reg = registry.write().unwrap();
|
|
473
|
-
let result = reg.register(processor, 100);
|
|
474
|
-
|
|
475
|
-
assert!(result.is_err());
|
|
476
|
-
assert!(matches!(result.err().unwrap(), KreuzbergError::Validation { .. }));
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
{
|
|
480
|
-
let mut reg = registry.write().unwrap();
|
|
481
|
-
reg.shutdown_all().unwrap();
|
|
482
|
-
}
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
#[serial]
|
|
486
|
-
#[test]
|
|
487
|
-
fn test_multiple_postprocessors_execution_order() {
|
|
488
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
489
|
-
let registry = get_post_processor_registry();
|
|
490
|
-
|
|
491
|
-
let early_processor = Arc::new(MetadataAddingProcessor {
|
|
492
|
-
name: "early-processor".to_string(),
|
|
493
|
-
initialized: AtomicBool::new(false),
|
|
494
|
-
});
|
|
495
|
-
|
|
496
|
-
let middle_processor = Arc::new(UppercaseProcessor {
|
|
497
|
-
name: "middle-processor".to_string(),
|
|
498
|
-
});
|
|
499
|
-
|
|
500
|
-
let late_processor = Arc::new(AppendTextProcessor {
|
|
501
|
-
name: "late-processor".to_string(),
|
|
502
|
-
text_to_append: " [LATE]".to_string(),
|
|
503
|
-
call_count: AtomicUsize::new(0),
|
|
504
|
-
});
|
|
505
|
-
|
|
506
|
-
{
|
|
507
|
-
let mut reg = registry.write().unwrap();
|
|
508
|
-
reg.register(early_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
509
|
-
reg.register(middle_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
510
|
-
reg.register(late_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
let config = ExtractionConfig::default();
|
|
514
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
515
|
-
|
|
516
|
-
assert!(result.is_ok());
|
|
517
|
-
|
|
518
|
-
let extraction_result = result.unwrap();
|
|
519
|
-
|
|
520
|
-
assert!(extraction_result.metadata.additional.contains_key("processed_by"));
|
|
521
|
-
assert!(!extraction_result.content.chars().any(|c| c.is_lowercase()));
|
|
522
|
-
assert!(extraction_result.content.contains("[LATE]"));
|
|
523
|
-
|
|
524
|
-
{
|
|
525
|
-
let mut reg = registry.write().unwrap();
|
|
526
|
-
reg.shutdown_all().unwrap();
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
|
|
530
|
-
#[serial]
|
|
531
|
-
#[test]
|
|
532
|
-
fn test_postprocessor_preserves_mime_type() {
|
|
533
|
-
let test_file = "../../test_documents/text/fake_text.txt";
|
|
534
|
-
let registry = get_post_processor_registry();
|
|
535
|
-
|
|
536
|
-
let processor = Arc::new(AppendTextProcessor {
|
|
537
|
-
name: "mime-test".to_string(),
|
|
538
|
-
text_to_append: " [TEST]".to_string(),
|
|
539
|
-
call_count: AtomicUsize::new(0),
|
|
540
|
-
});
|
|
541
|
-
|
|
542
|
-
{
|
|
543
|
-
let mut reg = registry.write().unwrap();
|
|
544
|
-
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
545
|
-
.unwrap();
|
|
546
|
-
}
|
|
547
|
-
|
|
548
|
-
let config = ExtractionConfig::default();
|
|
549
|
-
let result = extract_file_sync(test_file, None, &config);
|
|
550
|
-
|
|
551
|
-
assert!(result.is_ok());
|
|
552
|
-
|
|
553
|
-
let extraction_result = result.unwrap();
|
|
554
|
-
assert_eq!(extraction_result.mime_type, "text/plain");
|
|
555
|
-
|
|
556
|
-
{
|
|
557
|
-
let mut reg = registry.write().unwrap();
|
|
558
|
-
reg.shutdown_all().unwrap();
|
|
559
|
-
}
|
|
560
|
-
}
|
|
1
|
+
//! Comprehensive post-processor plugin system tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests custom post-processor registration, execution, modifications,
|
|
4
|
+
//! error handling, and cleanup with real file extraction.
|
|
5
|
+
|
|
6
|
+
use async_trait::async_trait;
|
|
7
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
8
|
+
use kreuzberg::plugins::registry::get_post_processor_registry;
|
|
9
|
+
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
10
|
+
use kreuzberg::types::ExtractionResult;
|
|
11
|
+
use kreuzberg::{KreuzbergError, Result, extract_file_sync};
|
|
12
|
+
use serial_test::serial;
|
|
13
|
+
use std::sync::Arc;
|
|
14
|
+
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
|
15
|
+
|
|
16
|
+
struct AppendTextProcessor {
|
|
17
|
+
name: String,
|
|
18
|
+
text_to_append: String,
|
|
19
|
+
call_count: AtomicUsize,
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
impl Plugin for AppendTextProcessor {
|
|
23
|
+
fn name(&self) -> &str {
|
|
24
|
+
&self.name
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
fn version(&self) -> String {
|
|
28
|
+
"1.0.0".to_string()
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
fn initialize(&self) -> Result<()> {
|
|
32
|
+
Ok(())
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn shutdown(&self) -> Result<()> {
|
|
36
|
+
Ok(())
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
#[async_trait]
|
|
41
|
+
impl PostProcessor for AppendTextProcessor {
|
|
42
|
+
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
43
|
+
self.call_count.fetch_add(1, Ordering::SeqCst);
|
|
44
|
+
result.content.push_str(&self.text_to_append);
|
|
45
|
+
Ok(())
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
49
|
+
ProcessingStage::Late
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
struct MetadataAddingProcessor {
|
|
54
|
+
name: String,
|
|
55
|
+
initialized: AtomicBool,
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
impl Plugin for MetadataAddingProcessor {
|
|
59
|
+
fn name(&self) -> &str {
|
|
60
|
+
&self.name
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
fn version(&self) -> String {
|
|
64
|
+
"1.0.0".to_string()
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
fn initialize(&self) -> Result<()> {
|
|
68
|
+
self.initialized.store(true, Ordering::Release);
|
|
69
|
+
Ok(())
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
fn shutdown(&self) -> Result<()> {
|
|
73
|
+
self.initialized.store(false, Ordering::Release);
|
|
74
|
+
Ok(())
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
#[async_trait]
|
|
79
|
+
impl PostProcessor for MetadataAddingProcessor {
|
|
80
|
+
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
81
|
+
result
|
|
82
|
+
.metadata
|
|
83
|
+
.additional
|
|
84
|
+
.insert("processed_by".to_string(), serde_json::json!(self.name()));
|
|
85
|
+
result.metadata.additional.insert(
|
|
86
|
+
"word_count".to_string(),
|
|
87
|
+
serde_json::json!(result.content.split_whitespace().count()),
|
|
88
|
+
);
|
|
89
|
+
Ok(())
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
93
|
+
ProcessingStage::Early
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
struct UppercaseProcessor {
|
|
98
|
+
name: String,
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
impl Plugin for UppercaseProcessor {
|
|
102
|
+
fn name(&self) -> &str {
|
|
103
|
+
&self.name
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
fn version(&self) -> String {
|
|
107
|
+
"1.0.0".to_string()
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
fn initialize(&self) -> Result<()> {
|
|
111
|
+
Ok(())
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
fn shutdown(&self) -> Result<()> {
|
|
115
|
+
Ok(())
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
#[async_trait]
|
|
120
|
+
impl PostProcessor for UppercaseProcessor {
|
|
121
|
+
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
122
|
+
result.content = result.content.to_uppercase();
|
|
123
|
+
Ok(())
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
127
|
+
ProcessingStage::Middle
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
struct FailingProcessor {
|
|
132
|
+
name: String,
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
impl Plugin for FailingProcessor {
|
|
136
|
+
fn name(&self) -> &str {
|
|
137
|
+
&self.name
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
fn version(&self) -> String {
|
|
141
|
+
"1.0.0".to_string()
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
fn initialize(&self) -> Result<()> {
|
|
145
|
+
Ok(())
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
fn shutdown(&self) -> Result<()> {
|
|
149
|
+
Ok(())
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
#[async_trait]
|
|
154
|
+
impl PostProcessor for FailingProcessor {
|
|
155
|
+
async fn process(&self, _result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
156
|
+
Err(KreuzbergError::Plugin {
|
|
157
|
+
message: "Processor intentionally failed".to_string(),
|
|
158
|
+
plugin_name: self.name.clone(),
|
|
159
|
+
})
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
163
|
+
ProcessingStage::Early
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
#[serial]
|
|
168
|
+
#[test]
|
|
169
|
+
fn test_register_custom_postprocessor() {
|
|
170
|
+
let registry = get_post_processor_registry();
|
|
171
|
+
|
|
172
|
+
let processor = Arc::new(AppendTextProcessor {
|
|
173
|
+
name: "test-appender".to_string(),
|
|
174
|
+
text_to_append: " [PROCESSED]".to_string(),
|
|
175
|
+
call_count: AtomicUsize::new(0),
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
{
|
|
179
|
+
let mut reg = registry.write().unwrap();
|
|
180
|
+
reg.shutdown_all().unwrap();
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
{
|
|
184
|
+
let mut reg = registry.write().unwrap();
|
|
185
|
+
let result = reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100);
|
|
186
|
+
assert!(result.is_ok(), "Failed to register processor: {:?}", result.err());
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
let list = {
|
|
190
|
+
let reg = registry.read().unwrap();
|
|
191
|
+
reg.list()
|
|
192
|
+
};
|
|
193
|
+
|
|
194
|
+
assert!(list.contains(&"test-appender".to_string()));
|
|
195
|
+
|
|
196
|
+
{
|
|
197
|
+
let mut reg = registry.write().unwrap();
|
|
198
|
+
reg.shutdown_all().unwrap();
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
#[serial]
|
|
203
|
+
#[test]
|
|
204
|
+
fn test_postprocessor_called_during_extraction() {
|
|
205
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
206
|
+
let registry = get_post_processor_registry();
|
|
207
|
+
|
|
208
|
+
let processor = Arc::new(AppendTextProcessor {
|
|
209
|
+
name: "call-test-appender".to_string(),
|
|
210
|
+
text_to_append: "\n[APPENDED BY PROCESSOR]".to_string(),
|
|
211
|
+
call_count: AtomicUsize::new(0),
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
{
|
|
215
|
+
let mut reg = registry.write().unwrap();
|
|
216
|
+
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
217
|
+
.unwrap();
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
let config = ExtractionConfig::default();
|
|
221
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
222
|
+
|
|
223
|
+
assert!(result.is_ok(), "Extraction failed: {:?}", result.err());
|
|
224
|
+
|
|
225
|
+
let extraction_result = result.unwrap();
|
|
226
|
+
assert!(
|
|
227
|
+
extraction_result.content.contains("[APPENDED BY PROCESSOR]"),
|
|
228
|
+
"Processor did not modify content. Content: {}",
|
|
229
|
+
extraction_result.content
|
|
230
|
+
);
|
|
231
|
+
|
|
232
|
+
assert_eq!(
|
|
233
|
+
processor.call_count.load(Ordering::SeqCst),
|
|
234
|
+
1,
|
|
235
|
+
"Processor was not called exactly once"
|
|
236
|
+
);
|
|
237
|
+
|
|
238
|
+
{
|
|
239
|
+
let mut reg = registry.write().unwrap();
|
|
240
|
+
reg.shutdown_all().unwrap();
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
#[serial]
|
|
245
|
+
#[test]
|
|
246
|
+
fn test_postprocessor_modifies_content() {
|
|
247
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
248
|
+
let registry = get_post_processor_registry();
|
|
249
|
+
|
|
250
|
+
let processor = Arc::new(UppercaseProcessor {
|
|
251
|
+
name: "uppercase-processor".to_string(),
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
{
|
|
255
|
+
let mut reg = registry.write().unwrap();
|
|
256
|
+
reg.register(processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
let config = ExtractionConfig::default();
|
|
260
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
261
|
+
|
|
262
|
+
assert!(result.is_ok());
|
|
263
|
+
|
|
264
|
+
let extraction_result = result.unwrap();
|
|
265
|
+
let has_lowercase = extraction_result.content.chars().any(|c| c.is_lowercase());
|
|
266
|
+
|
|
267
|
+
assert!(!has_lowercase, "Content was not fully uppercased");
|
|
268
|
+
|
|
269
|
+
{
|
|
270
|
+
let mut reg = registry.write().unwrap();
|
|
271
|
+
reg.shutdown_all().unwrap();
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
#[serial]
|
|
276
|
+
#[test]
|
|
277
|
+
fn test_postprocessor_adds_metadata() {
|
|
278
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
279
|
+
let registry = get_post_processor_registry();
|
|
280
|
+
|
|
281
|
+
let processor = Arc::new(MetadataAddingProcessor {
|
|
282
|
+
name: "metadata-adder".to_string(),
|
|
283
|
+
initialized: AtomicBool::new(false),
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
{
|
|
287
|
+
let mut reg = registry.write().unwrap();
|
|
288
|
+
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
289
|
+
.unwrap();
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
assert!(
|
|
293
|
+
processor.initialized.load(Ordering::Acquire),
|
|
294
|
+
"Processor was not initialized"
|
|
295
|
+
);
|
|
296
|
+
|
|
297
|
+
let config = ExtractionConfig::default();
|
|
298
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
299
|
+
|
|
300
|
+
assert!(result.is_ok());
|
|
301
|
+
|
|
302
|
+
let extraction_result = result.unwrap();
|
|
303
|
+
|
|
304
|
+
assert!(
|
|
305
|
+
extraction_result.metadata.additional.contains_key("processed_by"),
|
|
306
|
+
"Metadata 'processed_by' not added"
|
|
307
|
+
);
|
|
308
|
+
assert!(
|
|
309
|
+
extraction_result.metadata.additional.contains_key("word_count"),
|
|
310
|
+
"Metadata 'word_count' not added"
|
|
311
|
+
);
|
|
312
|
+
|
|
313
|
+
let processed_by = extraction_result.metadata.additional.get("processed_by").unwrap();
|
|
314
|
+
assert_eq!(processed_by.as_str().unwrap(), "metadata-adder");
|
|
315
|
+
|
|
316
|
+
{
|
|
317
|
+
let mut reg = registry.write().unwrap();
|
|
318
|
+
reg.shutdown_all().unwrap();
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
assert!(
|
|
322
|
+
!processor.initialized.load(Ordering::Acquire),
|
|
323
|
+
"Processor was not shutdown"
|
|
324
|
+
);
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
#[serial]
|
|
328
|
+
#[test]
|
|
329
|
+
fn test_unregister_postprocessor() {
|
|
330
|
+
let registry = get_post_processor_registry();
|
|
331
|
+
|
|
332
|
+
let processor = Arc::new(AppendTextProcessor {
|
|
333
|
+
name: "unregister-test".to_string(),
|
|
334
|
+
text_to_append: " [SHOULD NOT APPEAR]".to_string(),
|
|
335
|
+
call_count: AtomicUsize::new(0),
|
|
336
|
+
});
|
|
337
|
+
|
|
338
|
+
{
|
|
339
|
+
let mut reg = registry.write().unwrap();
|
|
340
|
+
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
341
|
+
.unwrap();
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
{
|
|
345
|
+
let mut reg = registry.write().unwrap();
|
|
346
|
+
reg.remove("unregister-test").unwrap();
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
let list = {
|
|
350
|
+
let reg = registry.read().unwrap();
|
|
351
|
+
reg.list()
|
|
352
|
+
};
|
|
353
|
+
|
|
354
|
+
assert!(!list.contains(&"unregister-test".to_string()));
|
|
355
|
+
|
|
356
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
357
|
+
let config = ExtractionConfig::default();
|
|
358
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
359
|
+
|
|
360
|
+
assert!(result.is_ok());
|
|
361
|
+
|
|
362
|
+
let extraction_result = result.unwrap();
|
|
363
|
+
assert!(
|
|
364
|
+
!extraction_result.content.contains("[SHOULD NOT APPEAR]"),
|
|
365
|
+
"Unregistered processor still modified content"
|
|
366
|
+
);
|
|
367
|
+
|
|
368
|
+
assert_eq!(processor.call_count.load(Ordering::SeqCst), 0);
|
|
369
|
+
|
|
370
|
+
{
|
|
371
|
+
let mut reg = registry.write().unwrap();
|
|
372
|
+
reg.shutdown_all().unwrap();
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
#[serial]
|
|
377
|
+
#[test]
|
|
378
|
+
fn test_clear_all_postprocessors() {
|
|
379
|
+
let registry = get_post_processor_registry();
|
|
380
|
+
|
|
381
|
+
{
|
|
382
|
+
let mut reg = registry.write().unwrap();
|
|
383
|
+
reg.shutdown_all().unwrap();
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
let processor1 = Arc::new(AppendTextProcessor {
|
|
387
|
+
name: "clear-test-1".to_string(),
|
|
388
|
+
text_to_append: " [ONE]".to_string(),
|
|
389
|
+
call_count: AtomicUsize::new(0),
|
|
390
|
+
});
|
|
391
|
+
|
|
392
|
+
let processor2 = Arc::new(AppendTextProcessor {
|
|
393
|
+
name: "clear-test-2".to_string(),
|
|
394
|
+
text_to_append: " [TWO]".to_string(),
|
|
395
|
+
call_count: AtomicUsize::new(0),
|
|
396
|
+
});
|
|
397
|
+
|
|
398
|
+
{
|
|
399
|
+
let mut reg = registry.write().unwrap();
|
|
400
|
+
reg.register(processor1 as Arc<dyn PostProcessor>, 100).unwrap();
|
|
401
|
+
reg.register(processor2 as Arc<dyn PostProcessor>, 100).unwrap();
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
{
|
|
405
|
+
let mut reg = registry.write().unwrap();
|
|
406
|
+
reg.shutdown_all().unwrap();
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
let list = {
|
|
410
|
+
let reg = registry.read().unwrap();
|
|
411
|
+
reg.list()
|
|
412
|
+
};
|
|
413
|
+
|
|
414
|
+
assert!(list.is_empty(), "Registry was not cleared");
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
#[serial]
|
|
418
|
+
#[test]
|
|
419
|
+
fn test_postprocessor_error_handling() {
|
|
420
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
421
|
+
let registry = get_post_processor_registry();
|
|
422
|
+
|
|
423
|
+
let failing_processor = Arc::new(FailingProcessor {
|
|
424
|
+
name: "failing-processor".to_string(),
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
{
|
|
428
|
+
let mut reg = registry.write().unwrap();
|
|
429
|
+
reg.register(failing_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
let config = ExtractionConfig::default();
|
|
433
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
434
|
+
|
|
435
|
+
// NOTE: Plugin errors now bubble up and fail the extraction (design change)
|
|
436
|
+
assert!(
|
|
437
|
+
result.is_err(),
|
|
438
|
+
"Extraction should fail when postprocessor returns Plugin error"
|
|
439
|
+
);
|
|
440
|
+
|
|
441
|
+
match result {
|
|
442
|
+
Err(KreuzbergError::Plugin { message, plugin_name }) => {
|
|
443
|
+
assert_eq!(plugin_name, "failing-processor");
|
|
444
|
+
assert_eq!(message, "Processor intentionally failed");
|
|
445
|
+
}
|
|
446
|
+
_ => panic!("Expected Plugin error"),
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
{
|
|
450
|
+
let mut reg = registry.write().unwrap();
|
|
451
|
+
reg.shutdown_all().unwrap();
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
#[serial]
|
|
456
|
+
#[test]
|
|
457
|
+
fn test_postprocessor_invalid_name() {
|
|
458
|
+
let registry = get_post_processor_registry();
|
|
459
|
+
|
|
460
|
+
{
|
|
461
|
+
let mut reg = registry.write().unwrap();
|
|
462
|
+
reg.shutdown_all().unwrap();
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
let processor = Arc::new(AppendTextProcessor {
|
|
466
|
+
name: "invalid name".to_string(),
|
|
467
|
+
text_to_append: " [TEST]".to_string(),
|
|
468
|
+
call_count: AtomicUsize::new(0),
|
|
469
|
+
});
|
|
470
|
+
|
|
471
|
+
{
|
|
472
|
+
let mut reg = registry.write().unwrap();
|
|
473
|
+
let result = reg.register(processor, 100);
|
|
474
|
+
|
|
475
|
+
assert!(result.is_err());
|
|
476
|
+
assert!(matches!(result.err().unwrap(), KreuzbergError::Validation { .. }));
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
{
|
|
480
|
+
let mut reg = registry.write().unwrap();
|
|
481
|
+
reg.shutdown_all().unwrap();
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
#[serial]
|
|
486
|
+
#[test]
|
|
487
|
+
fn test_multiple_postprocessors_execution_order() {
|
|
488
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
489
|
+
let registry = get_post_processor_registry();
|
|
490
|
+
|
|
491
|
+
let early_processor = Arc::new(MetadataAddingProcessor {
|
|
492
|
+
name: "early-processor".to_string(),
|
|
493
|
+
initialized: AtomicBool::new(false),
|
|
494
|
+
});
|
|
495
|
+
|
|
496
|
+
let middle_processor = Arc::new(UppercaseProcessor {
|
|
497
|
+
name: "middle-processor".to_string(),
|
|
498
|
+
});
|
|
499
|
+
|
|
500
|
+
let late_processor = Arc::new(AppendTextProcessor {
|
|
501
|
+
name: "late-processor".to_string(),
|
|
502
|
+
text_to_append: " [LATE]".to_string(),
|
|
503
|
+
call_count: AtomicUsize::new(0),
|
|
504
|
+
});
|
|
505
|
+
|
|
506
|
+
{
|
|
507
|
+
let mut reg = registry.write().unwrap();
|
|
508
|
+
reg.register(early_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
509
|
+
reg.register(middle_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
510
|
+
reg.register(late_processor as Arc<dyn PostProcessor>, 100).unwrap();
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
let config = ExtractionConfig::default();
|
|
514
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
515
|
+
|
|
516
|
+
assert!(result.is_ok());
|
|
517
|
+
|
|
518
|
+
let extraction_result = result.unwrap();
|
|
519
|
+
|
|
520
|
+
assert!(extraction_result.metadata.additional.contains_key("processed_by"));
|
|
521
|
+
assert!(!extraction_result.content.chars().any(|c| c.is_lowercase()));
|
|
522
|
+
assert!(extraction_result.content.contains("[LATE]"));
|
|
523
|
+
|
|
524
|
+
{
|
|
525
|
+
let mut reg = registry.write().unwrap();
|
|
526
|
+
reg.shutdown_all().unwrap();
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
#[serial]
|
|
531
|
+
#[test]
|
|
532
|
+
fn test_postprocessor_preserves_mime_type() {
|
|
533
|
+
let test_file = "../../test_documents/text/fake_text.txt";
|
|
534
|
+
let registry = get_post_processor_registry();
|
|
535
|
+
|
|
536
|
+
let processor = Arc::new(AppendTextProcessor {
|
|
537
|
+
name: "mime-test".to_string(),
|
|
538
|
+
text_to_append: " [TEST]".to_string(),
|
|
539
|
+
call_count: AtomicUsize::new(0),
|
|
540
|
+
});
|
|
541
|
+
|
|
542
|
+
{
|
|
543
|
+
let mut reg = registry.write().unwrap();
|
|
544
|
+
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
545
|
+
.unwrap();
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
let config = ExtractionConfig::default();
|
|
549
|
+
let result = extract_file_sync(test_file, None, &config);
|
|
550
|
+
|
|
551
|
+
assert!(result.is_ok());
|
|
552
|
+
|
|
553
|
+
let extraction_result = result.unwrap();
|
|
554
|
+
assert_eq!(extraction_result.mime_type, "text/plain");
|
|
555
|
+
|
|
556
|
+
{
|
|
557
|
+
let mut reg = registry.write().unwrap();
|
|
558
|
+
reg.shutdown_all().unwrap();
|
|
559
|
+
}
|
|
560
|
+
}
|