kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs

@@ -1,164 +1,164 @@

The removed (rc1) and added (rc2) sides of this hunk are byte-for-byte identical: all 164 lines were removed and re-added unchanged. The file content is shown once below.

```rust
use std::ops::RangeInclusive;

/// CJK text tokenizer for token reduction.
///
/// This tokenizer uses bigram (2-character) tokenization for CJK text,
/// which is appropriate for token reduction where we want to preserve
/// meaning while reducing token count.
///
/// # Unicode Range Coverage
///
/// **Currently covers:** CJK Unified Ideographs (U+4E00-U+9FFF)
/// - Covers ~20,992 common Chinese/Japanese Kanji characters
/// - Sufficient for token reduction purposes with Chinese and Japanese text
///
/// **Intentionally excluded:**
/// - Hiragana (U+3040-U+309F): Japanese phonetic script
/// - Katakana (U+30A0-U+30FF): Japanese phonetic script
/// - Hangul (U+AC00-U+D7AF): Korean alphabet
///
/// These exclusions are intentional for token reduction. Hiragana and Katakana
/// are typically tokenized with whitespace, and Hangul has different tokenization
/// requirements. If broader CJK support is needed, consider expanding the range
/// or using language-specific tokenizers.
pub struct CjkTokenizer {
    cjk_range: RangeInclusive<u32>,
}

impl CjkTokenizer {
    pub fn new() -> Self {
        Self {
            cjk_range: 0x4E00..=0x9FFF,
        }
    }

    /// Checks if a character is a CJK Unified Ideograph (U+4E00-U+9FFF).
    ///
    /// Returns true for Chinese characters and Japanese Kanji, false for
    /// Hiragana, Katakana, Hangul, and non-CJK characters.
    #[inline]
    pub fn is_cjk_char(&self, c: char) -> bool {
        self.cjk_range.contains(&(c as u32))
    }

    #[inline]
    pub fn has_cjk(&self, text: &str) -> bool {
        text.chars().any(|c| self.is_cjk_char(c))
    }

    pub fn tokenize_cjk_string(&self, text: &str) -> Vec<String> {
        let chars: Vec<char> = text.chars().collect();
        self.tokenize_cjk_chars(&chars)
    }

    pub fn tokenize_cjk_chars(&self, chars: &[char]) -> Vec<String> {
        chars
            .chunks(2)
            .map(|chunk| {
                if chunk.len() == 2 {
                    format!("{}{}", chunk[0], chunk[1])
                } else {
                    chunk[0].to_string()
                }
            })
            .collect()
    }

    pub fn tokenize_mixed_text(&self, text: &str) -> Vec<String> {
        let whitespace_tokens: Vec<&str> = text.split_whitespace().collect();

        if whitespace_tokens.is_empty() {
            return if text.is_empty() {
                vec![]
            } else {
                vec![text.to_string()]
            };
        }

        if whitespace_tokens.len() == 1 {
            let token = whitespace_tokens[0];
            return if self.has_cjk(token) {
                self.tokenize_cjk_string(token)
            } else {
                vec![token.to_string()]
            };
        }

        let mut all_tokens = Vec::new();
        for token in whitespace_tokens {
            if self.has_cjk(token) {
                all_tokens.extend(self.tokenize_cjk_string(token));
            } else {
                all_tokens.push(token.to_string());
            }
        }
        all_tokens
    }
}

impl Default for CjkTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_cjk_char() {
        let tokenizer = CjkTokenizer::new();

        assert!(tokenizer.is_cjk_char('中'));
        assert!(tokenizer.is_cjk_char('国'));
        assert!(tokenizer.is_cjk_char('日'));
        assert!(tokenizer.is_cjk_char('本'));

        assert!(!tokenizer.is_cjk_char('a'));
        assert!(!tokenizer.is_cjk_char('Z'));
        assert!(!tokenizer.is_cjk_char('1'));
        assert!(!tokenizer.is_cjk_char(' '));
    }

    #[test]
    fn test_has_cjk() {
        let tokenizer = CjkTokenizer::new();

        assert!(tokenizer.has_cjk("这是中文"));
        assert!(tokenizer.has_cjk("mixed 中文 text"));
        assert!(tokenizer.has_cjk("日本語"));

        assert!(!tokenizer.has_cjk("English text"));
        assert!(!tokenizer.has_cjk("12345"));
        assert!(!tokenizer.has_cjk(""));
    }

    #[test]
    fn test_tokenize_cjk_string() {
        let tokenizer = CjkTokenizer::new();

        let tokens = tokenizer.tokenize_cjk_string("中国人");
        assert_eq!(tokens, vec!["中国", "人"]);

        let tokens = tokenizer.tokenize_cjk_string("四个字");
        assert_eq!(tokens, vec!["四个", "字"]);
    }

    #[test]
    fn test_tokenize_mixed_text() {
        let tokenizer = CjkTokenizer::new();

        let tokens = tokenizer.tokenize_mixed_text("hello world");
        assert_eq!(tokens, vec!["hello", "world"]);

        let tokens = tokenizer.tokenize_mixed_text("中国");
        assert_eq!(tokens, vec!["中国"]);

        let tokens = tokenizer.tokenize_mixed_text("hello 中国 world");
        assert_eq!(tokens, vec!["hello", "中国", "world"]);

        let tokens = tokenizer.tokenize_mixed_text("学习 machine learning 技术");
        assert_eq!(tokens, vec!["学习", "machine", "learning", "技术"]);
    }
}
```
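For context on the tokenizer this hunk carries over unchanged, here is a minimal usage sketch. It is illustrative only: it assumes the `CjkTokenizer` definition from the hunk above is in scope, and the `main` harness is not part of the gem (the crate's module path is not shown in this diff, so no `use` line is given).

```rust
// Illustrative sketch -- assumes CjkTokenizer from the hunk above is in scope.
fn main() {
    let tokenizer = CjkTokenizer::new();

    // Non-CJK input is split on whitespace and passed through unchanged.
    assert_eq!(
        tokenizer.tokenize_mixed_text("hello world"),
        vec!["hello", "world"]
    );

    // CJK runs become non-overlapping bigrams, with a lone trailing
    // character when the run has odd length: "中国人" -> ["中国", "人"].
    assert_eq!(tokenizer.tokenize_cjk_string("中国人"), vec!["中国", "人"]);

    // Mixed text applies the appropriate rule per whitespace token.
    assert_eq!(
        tokenizer.tokenize_mixed_text("学习 machine learning 技术"),
        vec!["学习", "machine", "learning", "技术"]
    );
}
```

Bigrams are a reasonable unit here because CJK Unified Ideographs carry meaning per character, so 2-character chunks shrink token count while keeping most word-level meaning intact.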
data/vendor/kreuzberg/src/text/token_reduction/config.rs

@@ -1,100 +1,100 @@

As with the previous hunk, the removed (rc1) and added (rc2) sides are identical; the 100-line file is shown once below.

```rust
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ReductionLevel {
    Off = 0,
    Light = 1,
    Moderate = 2,
    Aggressive = 3,
    Maximum = 4,
}

impl ReductionLevel {
    pub fn as_str(&self) -> &'static str {
        match self {
            ReductionLevel::Off => "off",
            ReductionLevel::Light => "light",
            ReductionLevel::Moderate => "moderate",
            ReductionLevel::Aggressive => "aggressive",
            ReductionLevel::Maximum => "maximum",
        }
    }
}

impl From<&str> for ReductionLevel {
    fn from(s: &str) -> Self {
        match s.to_lowercase().as_str() {
            "off" => ReductionLevel::Off,
            "light" => ReductionLevel::Light,
            "moderate" => ReductionLevel::Moderate,
            "aggressive" => ReductionLevel::Aggressive,
            "maximum" => ReductionLevel::Maximum,
            _ => ReductionLevel::Moderate,
        }
    }
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TokenReductionConfig {
    pub level: ReductionLevel,
    pub language_hint: Option<String>,
    pub preserve_markdown: bool,
    pub preserve_code: bool,
    pub semantic_threshold: f32,
    pub enable_parallel: bool,
    pub use_simd: bool,
    pub custom_stopwords: Option<HashMap<String, Vec<String>>>,
    pub preserve_patterns: Vec<String>,
    pub target_reduction: Option<f32>,
    pub enable_semantic_clustering: bool,
}

impl Default for TokenReductionConfig {
    fn default() -> Self {
        Self {
            level: ReductionLevel::Moderate,
            language_hint: None,
            preserve_markdown: false,
            preserve_code: true,
            semantic_threshold: 0.3,
            enable_parallel: true,
            use_simd: true,
            custom_stopwords: None,
            preserve_patterns: vec![],
            target_reduction: None,
            enable_semantic_clustering: false,
        }
    }
}

impl TokenReductionConfig {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        level: ReductionLevel,
        language_hint: Option<String>,
        preserve_markdown: bool,
        preserve_code: bool,
        semantic_threshold: f32,
        enable_parallel: bool,
        use_simd: bool,
        custom_stopwords: Option<HashMap<String, Vec<String>>>,
        preserve_patterns: Option<Vec<String>>,
        target_reduction: Option<f32>,
        enable_semantic_clustering: bool,
    ) -> Self {
        Self {
            level,
            language_hint,
            preserve_markdown,
            preserve_code,
            semantic_threshold: semantic_threshold.clamp(0.0, 1.0),
            enable_parallel,
            use_simd,
            custom_stopwords,
            preserve_patterns: preserve_patterns.unwrap_or_default(),
            target_reduction: target_reduction.map(|t| t.clamp(0.0, 1.0)),
            enable_semantic_clustering,
        }
    }
}
```
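A hedged sketch of how this config might be built, assuming only the items defined in the hunk above are in scope. Struct-update over `Default` is an illustrative pattern, not an API documented by this diff; the clamp behavior follows directly from `TokenReductionConfig::new`.

```rust
// Illustrative sketch -- assumes ReductionLevel and TokenReductionConfig
// from the hunk above are in scope.
fn main() {
    // Struct-update syntax over Default covers the common case without
    // the 11-argument constructor.
    let config = TokenReductionConfig {
        level: ReductionLevel::from("aggressive"),
        language_hint: Some("zh".to_string()),
        ..Default::default()
    };
    assert_eq!(config.level.as_str(), "aggressive");
    assert!(config.preserve_code); // true by default

    // The constructor clamps ratio-like arguments into [0.0, 1.0].
    let clamped = TokenReductionConfig::new(
        ReductionLevel::Maximum,
        None,       // language_hint
        false,      // preserve_markdown
        true,       // preserve_code
        1.7,        // semantic_threshold -> clamped to 1.0
        true,       // enable_parallel
        true,       // use_simd
        None,       // custom_stopwords
        None,       // preserve_patterns -> defaults to an empty Vec
        Some(-0.2), // target_reduction -> clamped to 0.0
        false,      // enable_semantic_clustering
    );
    assert_eq!(clamped.semantic_threshold, 1.0);
    assert_eq!(clamped.target_reduction, Some(0.0));
}
```

Note that `From<&str>` falls back to `ReductionLevel::Moderate` for unrecognized strings rather than returning an error, so string-driven configuration (e.g. from TOML or YAML) never fails on this field.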