kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,220 +1,220 @@
|
|
|
1
|
-
//! Text chunking post-processor.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides a PostProcessor plugin that chunks text content in
|
|
4
|
-
//! extraction results.
|
|
5
|
-
|
|
6
|
-
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
-
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
-
use async_trait::async_trait;
|
|
9
|
-
|
|
10
|
-
/// Post-processor that chunks text in document content.
|
|
11
|
-
///
|
|
12
|
-
/// This processor:
|
|
13
|
-
/// - Runs in the Middle processing stage
|
|
14
|
-
/// - Only processes when `config.chunking` is configured
|
|
15
|
-
/// - Stores chunks in `result.chunks`
|
|
16
|
-
/// - Uses configurable chunk size and overlap
|
|
17
|
-
///
|
|
18
|
-
/// # Example
|
|
19
|
-
///
|
|
20
|
-
/// ```rust,no_run
|
|
21
|
-
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
-
/// use kreuzberg::chunking::processor::ChunkingProcessor;
|
|
23
|
-
///
|
|
24
|
-
/// let processor = ChunkingProcessor;
|
|
25
|
-
/// assert_eq!(processor.name(), "text-chunking");
|
|
26
|
-
/// ```
|
|
27
|
-
#[derive(Debug, Clone, Copy)]
|
|
28
|
-
pub struct ChunkingProcessor;
|
|
29
|
-
|
|
30
|
-
impl Plugin for ChunkingProcessor {
|
|
31
|
-
fn name(&self) -> &str {
|
|
32
|
-
"text-chunking"
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
fn version(&self) -> String {
|
|
36
|
-
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
fn initialize(&self) -> Result<()> {
|
|
40
|
-
Ok(())
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
fn shutdown(&self) -> Result<()> {
|
|
44
|
-
Ok(())
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
-
impl PostProcessor for ChunkingProcessor {
|
|
51
|
-
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
-
let chunking_config = match &config.chunking {
|
|
53
|
-
Some(cfg) => cfg,
|
|
54
|
-
None => return Ok(()),
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
let chunk_config = crate::chunking::ChunkingConfig {
|
|
58
|
-
max_characters: chunking_config.max_chars,
|
|
59
|
-
overlap: chunking_config.max_overlap,
|
|
60
|
-
trim: true,
|
|
61
|
-
chunker_type: crate::chunking::ChunkerType::Text,
|
|
62
|
-
};
|
|
63
|
-
|
|
64
|
-
let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
|
|
65
|
-
.map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
|
|
66
|
-
result.chunks = Some(chunking_result.chunks);
|
|
67
|
-
|
|
68
|
-
Ok(())
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
-
ProcessingStage::Middle
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
-
config.chunking.is_some()
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
-
let text_length = result.content.len();
|
|
81
|
-
// Chunking is fast: ~1ms per 10KB
|
|
82
|
-
(text_length / 10240).max(1) as u64
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
#[cfg(test)]
|
|
87
|
-
mod tests {
|
|
88
|
-
use super::*;
|
|
89
|
-
use crate::core::config::ChunkingConfig;
|
|
90
|
-
use crate::types::Metadata;
|
|
91
|
-
|
|
92
|
-
#[tokio::test]
|
|
93
|
-
async fn test_chunking_processor() {
|
|
94
|
-
let processor = ChunkingProcessor;
|
|
95
|
-
let config = ExtractionConfig {
|
|
96
|
-
chunking: Some(ChunkingConfig {
|
|
97
|
-
max_chars: 100,
|
|
98
|
-
max_overlap: 10,
|
|
99
|
-
embedding: None,
|
|
100
|
-
preset: None,
|
|
101
|
-
}),
|
|
102
|
-
..Default::default()
|
|
103
|
-
};
|
|
104
|
-
|
|
105
|
-
let mut result = ExtractionResult {
|
|
106
|
-
content: "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.".to_string(),
|
|
107
|
-
mime_type: "text/plain".to_string(),
|
|
108
|
-
metadata: Metadata::default(),
|
|
109
|
-
tables: vec![],
|
|
110
|
-
detected_languages: None,
|
|
111
|
-
chunks: None,
|
|
112
|
-
images: None,
|
|
113
|
-
pages: None,
|
|
114
|
-
};
|
|
115
|
-
|
|
116
|
-
processor.process(&mut result, &config).await.unwrap();
|
|
117
|
-
|
|
118
|
-
assert!(result.chunks.is_some());
|
|
119
|
-
let chunks = result.chunks.unwrap();
|
|
120
|
-
assert!(!chunks.is_empty());
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
#[tokio::test]
|
|
124
|
-
async fn test_chunking_processor_no_config() {
|
|
125
|
-
let processor = ChunkingProcessor;
|
|
126
|
-
let config = ExtractionConfig::default();
|
|
127
|
-
|
|
128
|
-
let mut result = ExtractionResult {
|
|
129
|
-
content: "Some text".to_string(),
|
|
130
|
-
mime_type: "text/plain".to_string(),
|
|
131
|
-
metadata: Metadata::default(),
|
|
132
|
-
tables: vec![],
|
|
133
|
-
detected_languages: None,
|
|
134
|
-
chunks: None,
|
|
135
|
-
images: None,
|
|
136
|
-
pages: None,
|
|
137
|
-
};
|
|
138
|
-
|
|
139
|
-
processor.process(&mut result, &config).await.unwrap();
|
|
140
|
-
|
|
141
|
-
assert!(result.chunks.is_none());
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
#[test]
|
|
145
|
-
fn test_chunking_processor_plugin_interface() {
|
|
146
|
-
let processor = ChunkingProcessor;
|
|
147
|
-
assert_eq!(processor.name(), "text-chunking");
|
|
148
|
-
assert!(!processor.version().is_empty());
|
|
149
|
-
assert!(processor.initialize().is_ok());
|
|
150
|
-
assert!(processor.shutdown().is_ok());
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#[test]
|
|
154
|
-
fn test_chunking_processor_stage() {
|
|
155
|
-
let processor = ChunkingProcessor;
|
|
156
|
-
assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
#[test]
|
|
160
|
-
fn test_chunking_processor_should_process() {
|
|
161
|
-
let processor = ChunkingProcessor;
|
|
162
|
-
|
|
163
|
-
let result = ExtractionResult {
|
|
164
|
-
content: "Sample text".to_string(),
|
|
165
|
-
mime_type: "text/plain".to_string(),
|
|
166
|
-
metadata: Metadata::default(),
|
|
167
|
-
tables: vec![],
|
|
168
|
-
detected_languages: None,
|
|
169
|
-
chunks: None,
|
|
170
|
-
images: None,
|
|
171
|
-
pages: None,
|
|
172
|
-
};
|
|
173
|
-
|
|
174
|
-
let config_with_chunking = ExtractionConfig {
|
|
175
|
-
chunking: Some(crate::core::config::ChunkingConfig {
|
|
176
|
-
max_chars: 100,
|
|
177
|
-
max_overlap: 10,
|
|
178
|
-
embedding: None,
|
|
179
|
-
preset: None,
|
|
180
|
-
}),
|
|
181
|
-
..Default::default()
|
|
182
|
-
};
|
|
183
|
-
assert!(processor.should_process(&result, &config_with_chunking));
|
|
184
|
-
|
|
185
|
-
let config_without_chunking = ExtractionConfig::default();
|
|
186
|
-
assert!(!processor.should_process(&result, &config_without_chunking));
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
#[test]
|
|
190
|
-
fn test_chunking_processor_estimated_duration() {
|
|
191
|
-
let processor = ChunkingProcessor;
|
|
192
|
-
|
|
193
|
-
let short_result = ExtractionResult {
|
|
194
|
-
content: "Short".to_string(),
|
|
195
|
-
mime_type: "text/plain".to_string(),
|
|
196
|
-
metadata: Metadata::default(),
|
|
197
|
-
tables: vec![],
|
|
198
|
-
detected_languages: None,
|
|
199
|
-
chunks: None,
|
|
200
|
-
images: None,
|
|
201
|
-
pages: None,
|
|
202
|
-
};
|
|
203
|
-
|
|
204
|
-
let long_result = ExtractionResult {
|
|
205
|
-
content: "a".repeat(100000),
|
|
206
|
-
mime_type: "text/plain".to_string(),
|
|
207
|
-
metadata: Metadata::default(),
|
|
208
|
-
tables: vec![],
|
|
209
|
-
detected_languages: None,
|
|
210
|
-
chunks: None,
|
|
211
|
-
images: None,
|
|
212
|
-
pages: None,
|
|
213
|
-
};
|
|
214
|
-
|
|
215
|
-
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
216
|
-
let long_duration = processor.estimated_duration_ms(&long_result);
|
|
217
|
-
|
|
218
|
-
assert!(long_duration > short_duration);
|
|
219
|
-
}
|
|
220
|
-
}
|
|
1
|
+
//! Text chunking post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that chunks text content in
|
|
4
|
+
//! extraction results.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor that chunks text in document content.
|
|
11
|
+
///
|
|
12
|
+
/// This processor:
|
|
13
|
+
/// - Runs in the Middle processing stage
|
|
14
|
+
/// - Only processes when `config.chunking` is configured
|
|
15
|
+
/// - Stores chunks in `result.chunks`
|
|
16
|
+
/// - Uses configurable chunk size and overlap
|
|
17
|
+
///
|
|
18
|
+
/// # Example
|
|
19
|
+
///
|
|
20
|
+
/// ```rust,no_run
|
|
21
|
+
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
+
/// use kreuzberg::chunking::processor::ChunkingProcessor;
|
|
23
|
+
///
|
|
24
|
+
/// let processor = ChunkingProcessor;
|
|
25
|
+
/// assert_eq!(processor.name(), "text-chunking");
|
|
26
|
+
/// ```
|
|
27
|
+
#[derive(Debug, Clone, Copy)]
|
|
28
|
+
pub struct ChunkingProcessor;
|
|
29
|
+
|
|
30
|
+
impl Plugin for ChunkingProcessor {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"text-chunking"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for ChunkingProcessor {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
let chunking_config = match &config.chunking {
|
|
53
|
+
Some(cfg) => cfg,
|
|
54
|
+
None => return Ok(()),
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
let chunk_config = crate::chunking::ChunkingConfig {
|
|
58
|
+
max_characters: chunking_config.max_chars,
|
|
59
|
+
overlap: chunking_config.max_overlap,
|
|
60
|
+
trim: true,
|
|
61
|
+
chunker_type: crate::chunking::ChunkerType::Text,
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
|
|
65
|
+
.map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
|
|
66
|
+
result.chunks = Some(chunking_result.chunks);
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
+
ProcessingStage::Middle
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
+
config.chunking.is_some()
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
+
let text_length = result.content.len();
|
|
81
|
+
// Chunking is fast: ~1ms per 10KB
|
|
82
|
+
(text_length / 10240).max(1) as u64
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ChunkingConfig;
    use crate::types::Metadata;

    /// Builds a `text/plain` extraction result that carries only `content`.
    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Extraction config with chunking enabled (`max_chars` 100, `max_overlap` 10).
    fn config_with_chunking() -> ExtractionConfig {
        ExtractionConfig {
            chunking: Some(ChunkingConfig {
                max_chars: 100,
                max_overlap: 10,
                embedding: None,
                preset: None,
            }),
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_chunking_processor() {
        let processor = ChunkingProcessor;
        let config = config_with_chunking();
        let mut result = plain_text_result(
            "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.",
        );

        processor.process(&mut result, &config).await.unwrap();

        let chunks = result.chunks.as_ref();
        assert!(chunks.is_some());
        assert!(!chunks.unwrap().is_empty());
    }

    #[tokio::test]
    async fn test_chunking_processor_no_config() {
        let processor = ChunkingProcessor;
        let config = ExtractionConfig::default();
        let mut result = plain_text_result("Some text");

        processor.process(&mut result, &config).await.unwrap();

        assert!(result.chunks.is_none());
    }

    #[test]
    fn test_chunking_processor_plugin_interface() {
        let processor = ChunkingProcessor;

        assert_eq!(processor.name(), "text-chunking");
        assert!(!processor.version().is_empty());
        assert!(processor.initialize().is_ok());
        assert!(processor.shutdown().is_ok());
    }

    #[test]
    fn test_chunking_processor_stage() {
        let processor = ChunkingProcessor;

        assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
    }

    #[test]
    fn test_chunking_processor_should_process() {
        let processor = ChunkingProcessor;
        let result = plain_text_result("Sample text");

        let enabled = config_with_chunking();
        assert!(processor.should_process(&result, &enabled));

        let disabled = ExtractionConfig::default();
        assert!(!processor.should_process(&result, &disabled));
    }

    #[test]
    fn test_chunking_processor_estimated_duration() {
        let processor = ChunkingProcessor;

        let short_result = plain_text_result("Short");
        let long_result = plain_text_result("a".repeat(100000));

        let short_duration = processor.estimated_duration_ms(&short_result);
        let long_duration = processor.estimated_duration_ms(&long_result);

        assert!(long_duration > short_duration);
    }
}
|
|
@@ -1,95 +1,95 @@
|
|
|
1
|
-
//! Internal batch mode tracking using tokio task-local storage.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides a way to track whether we're in batch processing mode
|
|
4
|
-
//! without exposing it in the public API. Extractors check this flag to decide
|
|
5
|
-
//! whether to use `spawn_blocking` for CPU-intensive work.
|
|
6
|
-
|
|
7
|
-
use std::cell::Cell;
|
|
8
|
-
use tokio::task_local;
|
|
9
|
-
|
|
10
|
-
task_local! {
    /// Per-task switch that marks batch processing.
    ///
    /// `true` tells extractors to hand CPU-intensive work to `spawn_blocking`
    /// so it can run in parallel; `false` (single-file mode) lets them run the
    /// work directly and skip the spawn overhead.
    static BATCH_MODE: Cell<bool>;
}
|
|
18
|
-
|
|
19
|
-
/// Check if we're currently in batch processing mode.
|
|
20
|
-
///
|
|
21
|
-
/// Returns `false` if the task-local is not set (single-file mode).
|
|
22
|
-
#[allow(dead_code)]
|
|
23
|
-
pub fn is_batch_mode() -> bool {
|
|
24
|
-
BATCH_MODE.try_with(|cell| cell.get()).unwrap_or(false)
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
/// Run a future with batch mode enabled.
|
|
28
|
-
///
|
|
29
|
-
/// This sets the task-local BATCH_MODE flag for the duration of the future.
|
|
30
|
-
pub async fn with_batch_mode<F, T>(future: F) -> T
|
|
31
|
-
where
|
|
32
|
-
F: std::future::Future<Output = T>,
|
|
33
|
-
{
|
|
34
|
-
BATCH_MODE.scope(Cell::new(true), future).await
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Outside any `with_batch_mode` scope the flag reads as false.
    #[tokio::test]
    async fn test_batch_mode_not_set_by_default() {
        assert!(!is_batch_mode(), "batch mode should be false by default");
    }

    /// Inside the scope the flag reads as true.
    #[tokio::test]
    async fn test_with_batch_mode_sets_flag() {
        let seen_inside = with_batch_mode(async { is_batch_mode() }).await;

        assert!(seen_inside, "batch mode should be true inside with_batch_mode");
    }

    /// The flag is only visible while the wrapped future runs.
    #[tokio::test]
    async fn test_batch_mode_scoped_to_future() {
        assert!(!is_batch_mode(), "batch mode should be false before");

        let scoped = async {
            assert!(is_batch_mode(), "batch mode should be true inside");
        };
        with_batch_mode(scoped).await;

        assert!(!is_batch_mode(), "batch mode should be false after future completes");
    }

    /// Nesting one scope inside another keeps the flag set at both levels.
    #[tokio::test]
    async fn test_nested_batch_mode_calls() {
        let (outer, inner) = with_batch_mode(async {
            let outer = is_batch_mode();
            let inner = with_batch_mode(async { is_batch_mode() }).await;
            (outer, inner)
        })
        .await;

        assert!(outer, "outer batch mode should be true");
        assert!(inner, "inner batch mode should be true");
    }

    /// Consecutive scopes do not leak the flag into the code between them.
    #[tokio::test]
    async fn test_batch_mode_unaffected_after_with_batch_mode() {
        let first = async {
            assert!(is_batch_mode(), "first call should set batch mode");
        };
        with_batch_mode(first).await;

        assert!(!is_batch_mode(), "batch mode should be false between calls");

        let second = async {
            assert!(is_batch_mode(), "second call should set batch mode");
        };
        with_batch_mode(second).await;

        assert!(!is_batch_mode(), "batch mode should be false after all calls");
    }
}
|
|
1
|
+
//! Internal batch mode tracking using tokio task-local storage.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a way to track whether we're in batch processing mode
|
|
4
|
+
//! without exposing it in the public API. Extractors check this flag to decide
|
|
5
|
+
//! whether to use `spawn_blocking` for CPU-intensive work.
|
|
6
|
+
|
|
7
|
+
use std::cell::Cell;
|
|
8
|
+
use tokio::task_local;
|
|
9
|
+
|
|
10
|
+
task_local! {
    /// Per-task switch that marks batch processing.
    ///
    /// `true` tells extractors to hand CPU-intensive work to `spawn_blocking`
    /// so it can run in parallel; `false` (single-file mode) lets them run the
    /// work directly and skip the spawn overhead.
    static BATCH_MODE: Cell<bool>;
}
|
|
18
|
+
|
|
19
|
+
/// Check if we're currently in batch processing mode.
|
|
20
|
+
///
|
|
21
|
+
/// Returns `false` if the task-local is not set (single-file mode).
|
|
22
|
+
#[allow(dead_code)]
|
|
23
|
+
pub fn is_batch_mode() -> bool {
|
|
24
|
+
BATCH_MODE.try_with(|cell| cell.get()).unwrap_or(false)
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/// Run a future with batch mode enabled.
|
|
28
|
+
///
|
|
29
|
+
/// This sets the task-local BATCH_MODE flag for the duration of the future.
|
|
30
|
+
pub async fn with_batch_mode<F, T>(future: F) -> T
|
|
31
|
+
where
|
|
32
|
+
F: std::future::Future<Output = T>,
|
|
33
|
+
{
|
|
34
|
+
BATCH_MODE.scope(Cell::new(true), future).await
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Outside any `with_batch_mode` scope the flag reads as false.
    #[tokio::test]
    async fn test_batch_mode_not_set_by_default() {
        assert!(!is_batch_mode(), "batch mode should be false by default");
    }

    /// Inside the scope the flag reads as true.
    #[tokio::test]
    async fn test_with_batch_mode_sets_flag() {
        let seen_inside = with_batch_mode(async { is_batch_mode() }).await;

        assert!(seen_inside, "batch mode should be true inside with_batch_mode");
    }

    /// The flag is only visible while the wrapped future runs.
    #[tokio::test]
    async fn test_batch_mode_scoped_to_future() {
        assert!(!is_batch_mode(), "batch mode should be false before");

        let scoped = async {
            assert!(is_batch_mode(), "batch mode should be true inside");
        };
        with_batch_mode(scoped).await;

        assert!(!is_batch_mode(), "batch mode should be false after future completes");
    }

    /// Nesting one scope inside another keeps the flag set at both levels.
    #[tokio::test]
    async fn test_nested_batch_mode_calls() {
        let (outer, inner) = with_batch_mode(async {
            let outer = is_batch_mode();
            let inner = with_batch_mode(async { is_batch_mode() }).await;
            (outer, inner)
        })
        .await;

        assert!(outer, "outer batch mode should be true");
        assert!(inner, "inner batch mode should be true");
    }

    /// Consecutive scopes do not leak the flag into the code between them.
    #[tokio::test]
    async fn test_batch_mode_unaffected_after_with_batch_mode() {
        let first = async {
            assert!(is_batch_mode(), "first call should set batch mode");
        };
        with_batch_mode(first).await;

        assert!(!is_batch_mode(), "batch mode should be false between calls");

        let second = async {
            assert!(is_batch_mode(), "second call should set batch mode");
        };
        with_batch_mode(second).await;

        assert!(!is_batch_mode(), "batch mode should be false after all calls");
    }
}
|