kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
//! Batch extraction optimizations using object pooling.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides optimized batch processing utilities that leverage
|
|
4
|
+
//! object pooling to reduce allocations during concurrent extraction of
|
|
5
|
+
//! multiple documents.
|
|
6
|
+
//!
|
|
7
|
+
//! # Performance Impact
|
|
8
|
+
//!
|
|
9
|
+
//! - Reuses temporary string/buffer allocations across documents
|
|
10
|
+
//! - Reduces allocation pressure by ~5-10%
|
|
11
|
+
//! - Overall throughput improvement of 5-10% for batch operations
|
|
12
|
+
//!
|
|
13
|
+
//! # Usage
|
|
14
|
+
//!
|
|
15
|
+
//! The batch extraction functions automatically use pooling internally.
|
|
16
|
+
//! For manual control, use `BatchProcessor` to create pools and manage
|
|
17
|
+
//! extraction with custom pool sizes.
|
|
18
|
+
|
|
19
|
+
use crate::core::config::ExtractionConfig;
|
|
20
|
+
use crate::types::ExtractionResult;
|
|
21
|
+
use crate::utils::pool::{ByteBufferPool, StringBufferPool, create_byte_buffer_pool, create_string_buffer_pool};
|
|
22
|
+
use crate::utils::pool_sizing::PoolSizeHint;
|
|
23
|
+
use crate::{KreuzbergError, Result};
|
|
24
|
+
use parking_lot::Mutex;
|
|
25
|
+
use std::path::Path;
|
|
26
|
+
use std::sync::Arc;
|
|
27
|
+
use std::sync::atomic::{AtomicBool, Ordering};
|
|
28
|
+
|
|
29
|
+
/// Configuration for batch processing with pooling optimizations.
///
/// The first four fields are the sizing parameters handed to the string and
/// byte buffer pool constructors; `max_concurrent` is an optional concurrency
/// limit.
#[derive(Debug, Clone)]
pub struct BatchProcessorConfig {
    /// Maximum number of string buffers to maintain in the pool.
    pub string_pool_size: usize,

    /// Initial capacity, in bytes, of each pooled string buffer.
    pub string_buffer_capacity: usize,

    /// Maximum number of byte buffers to maintain in the pool.
    pub byte_pool_size: usize,

    /// Initial capacity, in bytes, of each pooled byte buffer.
    pub byte_buffer_capacity: usize,

    /// Optional cap on the number of concurrent extractions
    /// (for concurrency control).
    pub max_concurrent: Option<usize>,
}
|
|
47
|
+
|
|
48
|
+
impl Default for BatchProcessorConfig {
|
|
49
|
+
fn default() -> Self {
|
|
50
|
+
BatchProcessorConfig {
|
|
51
|
+
string_pool_size: 10,
|
|
52
|
+
string_buffer_capacity: 8192,
|
|
53
|
+
byte_pool_size: 10,
|
|
54
|
+
byte_buffer_capacity: 65536,
|
|
55
|
+
max_concurrent: None,
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Batch processor that manages object pools for optimized extraction.
|
|
61
|
+
///
|
|
62
|
+
/// This struct manages the lifecycle of reusable object pools used during
|
|
63
|
+
/// batch extraction. Pools are created lazily on first use and reused across
|
|
64
|
+
/// all documents processed by this batch processor.
|
|
65
|
+
///
|
|
66
|
+
/// # Lazy Initialization
|
|
67
|
+
///
|
|
68
|
+
/// Pools are initialized on demand to reduce memory usage for applications
|
|
69
|
+
/// that may not use batch processing immediately or at all.
|
|
70
|
+
pub struct BatchProcessor {
|
|
71
|
+
string_pool: Mutex<Option<Arc<StringBufferPool>>>,
|
|
72
|
+
byte_pool: Mutex<Option<Arc<ByteBufferPool>>>,
|
|
73
|
+
config: BatchProcessorConfig,
|
|
74
|
+
string_pool_initialized: AtomicBool,
|
|
75
|
+
byte_pool_initialized: AtomicBool,
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
impl BatchProcessor {
|
|
79
|
+
/// Create a new batch processor with default pool configuration.
|
|
80
|
+
///
|
|
81
|
+
/// # Returns
|
|
82
|
+
///
|
|
83
|
+
/// A new `BatchProcessor` ready to process documents.
|
|
84
|
+
///
|
|
85
|
+
/// # Example
|
|
86
|
+
///
|
|
87
|
+
/// ```rust,no_run
|
|
88
|
+
/// use kreuzberg::core::batch_optimizations::BatchProcessor;
|
|
89
|
+
///
|
|
90
|
+
/// let processor = BatchProcessor::new();
|
|
91
|
+
/// ```
|
|
92
|
+
pub fn new() -> Self {
|
|
93
|
+
Self::with_config(BatchProcessorConfig::default())
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/// Create a new batch processor with custom pool configuration.
|
|
97
|
+
///
|
|
98
|
+
/// Pools are not created immediately but lazily on first access.
|
|
99
|
+
///
|
|
100
|
+
/// # Arguments
|
|
101
|
+
///
|
|
102
|
+
/// * `config` - Custom batch processor configuration
|
|
103
|
+
///
|
|
104
|
+
/// # Returns
|
|
105
|
+
///
|
|
106
|
+
/// A new `BatchProcessor` configured with the provided settings.
|
|
107
|
+
///
|
|
108
|
+
/// # Example
|
|
109
|
+
///
|
|
110
|
+
/// ```rust,no_run
|
|
111
|
+
/// use kreuzberg::core::batch_optimizations::{BatchProcessor, BatchProcessorConfig};
|
|
112
|
+
///
|
|
113
|
+
/// let mut config = BatchProcessorConfig::default();
|
|
114
|
+
/// config.string_pool_size = 20;
|
|
115
|
+
/// config.string_buffer_capacity = 16384;
|
|
116
|
+
/// let processor = BatchProcessor::with_config(config);
|
|
117
|
+
/// ```
|
|
118
|
+
pub fn with_config(config: BatchProcessorConfig) -> Self {
|
|
119
|
+
BatchProcessor {
|
|
120
|
+
string_pool: Mutex::new(None),
|
|
121
|
+
byte_pool: Mutex::new(None),
|
|
122
|
+
config,
|
|
123
|
+
string_pool_initialized: AtomicBool::new(false),
|
|
124
|
+
byte_pool_initialized: AtomicBool::new(false),
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Create a batch processor with pool sizes optimized for a specific document.
|
|
129
|
+
///
|
|
130
|
+
/// This method uses a `PoolSizeHint` (derived from file size and MIME type)
|
|
131
|
+
/// to create a batch processor with appropriately sized pools. This reduces
|
|
132
|
+
/// memory waste by tailoring pool allocation to actual document complexity.
|
|
133
|
+
///
|
|
134
|
+
/// # Arguments
|
|
135
|
+
///
|
|
136
|
+
/// * `hint` - Pool sizing hint containing recommended buffer counts and capacities
|
|
137
|
+
///
|
|
138
|
+
/// # Returns
|
|
139
|
+
///
|
|
140
|
+
/// A new `BatchProcessor` configured with the hint-based pool sizes
|
|
141
|
+
///
|
|
142
|
+
/// # Example
|
|
143
|
+
///
|
|
144
|
+
/// ```rust,no_run
|
|
145
|
+
/// use kreuzberg::core::batch_optimizations::BatchProcessor;
|
|
146
|
+
/// use kreuzberg::utils::pool_sizing::estimate_pool_size;
|
|
147
|
+
///
|
|
148
|
+
/// let hint = estimate_pool_size(5_000_000, "application/pdf");
|
|
149
|
+
/// let processor = BatchProcessor::with_pool_hint(&hint);
|
|
150
|
+
/// ```
|
|
151
|
+
pub fn with_pool_hint(hint: &PoolSizeHint) -> Self {
|
|
152
|
+
let config = BatchProcessorConfig {
|
|
153
|
+
string_pool_size: hint.string_buffer_count,
|
|
154
|
+
string_buffer_capacity: hint.string_buffer_capacity,
|
|
155
|
+
byte_pool_size: hint.byte_buffer_count,
|
|
156
|
+
byte_buffer_capacity: hint.byte_buffer_capacity,
|
|
157
|
+
max_concurrent: None,
|
|
158
|
+
};
|
|
159
|
+
Self::with_config(config)
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/// Get a reference to the string buffer pool.
|
|
163
|
+
///
|
|
164
|
+
/// Creates the pool lazily on first access.
|
|
165
|
+
/// Useful for custom pooling implementations that need direct pool access.
|
|
166
|
+
pub fn string_pool(&self) -> Arc<StringBufferPool> {
|
|
167
|
+
if self.string_pool_initialized.load(Ordering::Acquire) {
|
|
168
|
+
return Arc::clone(self.string_pool.lock().as_ref().unwrap());
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
let mut pool_opt = self.string_pool.lock();
|
|
172
|
+
if pool_opt.is_none() {
|
|
173
|
+
let pool = Arc::new(create_string_buffer_pool(
|
|
174
|
+
self.config.string_pool_size,
|
|
175
|
+
self.config.string_buffer_capacity,
|
|
176
|
+
));
|
|
177
|
+
*pool_opt = Some(pool);
|
|
178
|
+
self.string_pool_initialized.store(true, Ordering::Release);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
Arc::clone(pool_opt.as_ref().unwrap())
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/// Get a reference to the byte buffer pool.
|
|
185
|
+
///
|
|
186
|
+
/// Creates the pool lazily on first access.
|
|
187
|
+
/// Useful for custom pooling implementations that need direct pool access.
|
|
188
|
+
pub fn byte_pool(&self) -> Arc<ByteBufferPool> {
|
|
189
|
+
if self.byte_pool_initialized.load(Ordering::Acquire) {
|
|
190
|
+
return Arc::clone(self.byte_pool.lock().as_ref().unwrap());
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
let mut pool_opt = self.byte_pool.lock();
|
|
194
|
+
if pool_opt.is_none() {
|
|
195
|
+
let pool = Arc::new(create_byte_buffer_pool(
|
|
196
|
+
self.config.byte_pool_size,
|
|
197
|
+
self.config.byte_buffer_capacity,
|
|
198
|
+
));
|
|
199
|
+
*pool_opt = Some(pool);
|
|
200
|
+
self.byte_pool_initialized.store(true, Ordering::Release);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
Arc::clone(pool_opt.as_ref().unwrap())
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/// Get the current configuration.
|
|
207
|
+
pub fn config(&self) -> &BatchProcessorConfig {
|
|
208
|
+
&self.config
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/// Extract a batch of files.
///
/// Convenience entry point that forwards directly to
/// `crate::core::extractor::batch_extract_file`. This method itself does not
/// touch the processor's string or byte pools.
///
/// # Arguments
///
/// * `paths` - Paths of the files to extract
/// * `extraction_config` - Extraction configuration passed through unchanged
///
/// # Returns
///
/// The `ExtractionResult` vector produced by `batch_extract_file`
/// (documented upstream as matching the order of `paths`).
///
/// # Errors
///
/// Propagates any `KreuzbergError` returned by `batch_extract_file`.
#[cfg(feature = "tokio-runtime")]
pub async fn process_files(
    &self,
    paths: Vec<impl AsRef<Path>>,
    extraction_config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    crate::core::extractor::batch_extract_file(paths, extraction_config).await
}
|
|
238
|
+
|
|
239
|
+
/// Extract a batch of in-memory documents.
///
/// Each borrowed `(bytes, mime_type)` pair is copied into an owned
/// `(Vec<u8>, String)` and the resulting vector is forwarded to
/// `crate::core::extractor::batch_extract_bytes`. This method itself does not
/// touch the processor's string or byte pools.
///
/// # Arguments
///
/// * `contents` - Vector of (bytes, mime_type) tuples
/// * `extraction_config` - Extraction configuration passed through unchanged
///
/// # Returns
///
/// The `ExtractionResult` vector produced by `batch_extract_bytes`
/// (documented upstream as matching the order of `contents`).
///
/// # Errors
///
/// Propagates any `KreuzbergError` returned by `batch_extract_bytes`.
#[cfg(feature = "tokio-runtime")]
pub async fn process_bytes(
    &self,
    contents: Vec<(&[u8], &str)>,
    extraction_config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    // The downstream API takes owned buffers, so copy each borrowed pair.
    let owned: Vec<(Vec<u8>, String)> = contents
        .into_iter()
        .map(|(data, mime_type)| (data.to_vec(), mime_type.to_owned()))
        .collect();

    crate::core::extractor::batch_extract_bytes(owned, extraction_config).await
}
|
|
271
|
+
|
|
272
|
+
/// Get the number of pooled string buffers currently available.
|
|
273
|
+
pub fn string_pool_size(&self) -> usize {
|
|
274
|
+
self.string_pool.lock().as_ref().map(|p| p.size()).unwrap_or(0)
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
/// Get the number of pooled byte buffers currently available.
|
|
278
|
+
pub fn byte_pool_size(&self) -> usize {
|
|
279
|
+
self.byte_pool.lock().as_ref().map(|p| p.size()).unwrap_or(0)
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/// Clear all pooled objects, forcing new allocations on next acquire.
|
|
283
|
+
///
|
|
284
|
+
/// Useful for memory-constrained environments or to reclaim memory
|
|
285
|
+
/// after processing large batches.
|
|
286
|
+
pub fn clear_pools(&self) -> Result<()> {
|
|
287
|
+
let pool_opt = self.string_pool.lock();
|
|
288
|
+
if let Some(pool) = pool_opt.as_ref() {
|
|
289
|
+
pool.clear()
|
|
290
|
+
.map_err(|e| KreuzbergError::Other(format!("string pool error: {}", e)))?;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
let pool_opt = self.byte_pool.lock();
|
|
294
|
+
if let Some(pool) = pool_opt.as_ref() {
|
|
295
|
+
pool.clear()
|
|
296
|
+
.map_err(|e| KreuzbergError::Other(format!("byte pool error: {}", e)))?;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
Ok(())
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
impl Default for BatchProcessor {
|
|
304
|
+
fn default() -> Self {
|
|
305
|
+
Self::new()
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A new processor reports zero for both pool sizes.
    #[test]
    fn test_batch_processor_creation() {
        let fresh = BatchProcessor::new();

        assert_eq!(fresh.string_pool_size(), 0);
        assert_eq!(fresh.byte_pool_size(), 0);
    }

    /// A custom configuration is stored and exposed unchanged via `config()`.
    #[test]
    fn test_batch_processor_with_config() {
        let processor = BatchProcessor::with_config(BatchProcessorConfig {
            string_pool_size: 5,
            string_buffer_capacity: 1024,
            byte_pool_size: 3,
            byte_buffer_capacity: 4096,
            max_concurrent: None,
        });

        let stored = processor.config();
        assert_eq!(stored.string_pool_size, 5);
        assert_eq!(stored.byte_pool_size, 3);
    }

    /// A string buffer that was written to and returned comes back empty.
    #[test]
    fn test_batch_processor_string_pool_usage() {
        let processor = BatchProcessor::new();
        let strings = processor.string_pool();

        let mut first = strings.acquire().unwrap();
        first.push_str("test");
        drop(first);

        let second = strings.acquire().unwrap();
        assert_eq!(second.len(), 0);
    }

    /// A byte buffer that was written to and returned comes back empty.
    #[test]
    fn test_batch_processor_byte_pool_usage() {
        let processor = BatchProcessor::new();
        let bytes = processor.byte_pool();

        let mut first = bytes.acquire().unwrap();
        first.extend_from_slice(b"test");
        drop(first);

        let second = bytes.acquire().unwrap();
        assert_eq!(second.len(), 0);
    }

    /// `clear_pools` empties both pools after buffers have been returned to them.
    #[test]
    fn test_batch_processor_clear_pools() {
        let processor = BatchProcessor::new();

        // Acquire one buffer from each pool, then hand both back.
        let text_buf = processor.string_pool().acquire().unwrap();
        let byte_buf = processor.byte_pool().acquire().unwrap();
        drop(text_buf);
        drop(byte_buf);

        assert!(processor.string_pool_size() > 0);
        assert!(processor.byte_pool_size() > 0);

        processor.clear_pools().unwrap();

        assert_eq!(processor.string_pool_size(), 0);
        assert_eq!(processor.byte_pool_size(), 0);
    }
}
|