kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
mod cjk_utils;
|
|
2
|
+
mod config;
|
|
3
|
+
mod core;
|
|
4
|
+
mod filters;
|
|
5
|
+
mod semantic;
|
|
6
|
+
mod simd_text;
|
|
7
|
+
|
|
8
|
+
pub use config::{ReductionLevel, TokenReductionConfig};
|
|
9
|
+
pub use core::TokenReducer;
|
|
10
|
+
|
|
11
|
+
// TODO: move `token_reduction` out of the `text` module, and reorganize the rest of `text` properly (e.g. into `utils`).
|
|
12
|
+
|
|
13
|
+
/// Reduces token count in text while preserving meaning and structure.
|
|
14
|
+
///
|
|
15
|
+
/// This function removes stopwords, redundancy, and applies compression techniques
|
|
16
|
+
/// based on the specified reduction level. Supports 64 languages with automatic
|
|
17
|
+
/// stopword removal and optional semantic clustering.
|
|
18
|
+
///
|
|
19
|
+
/// # Arguments
|
|
20
|
+
///
|
|
21
|
+
/// * `text` - The input text to reduce
|
|
22
|
+
/// * `config` - Configuration specifying reduction level and options
|
|
23
|
+
/// * `language_hint` - Optional ISO 639-3 language code (e.g., "eng", "spa")
|
|
24
|
+
///
|
|
25
|
+
/// # Returns
|
|
26
|
+
///
|
|
27
|
+
/// Returns the reduced text with preserved structure (markdown, code blocks).
|
|
28
|
+
///
|
|
29
|
+
/// # Errors
|
|
30
|
+
///
|
|
31
|
+
/// Returns an error if the language hint is invalid or stopwords cannot be loaded.
|
|
32
|
+
///
|
|
33
|
+
/// # Examples
|
|
34
|
+
///
|
|
35
|
+
/// ```rust
|
|
36
|
+
/// use kreuzberg::text::token_reduction::{reduce_tokens, TokenReductionConfig, ReductionLevel};
|
|
37
|
+
///
|
|
38
|
+
/// let text = "This is a simple example text with some stopwords.";
|
|
39
|
+
/// let config = TokenReductionConfig::default();
|
|
40
|
+
/// let reduced = reduce_tokens(text, &config, Some("eng"))?;
|
|
41
|
+
/// println!("Reduced: {}", reduced);
|
|
42
|
+
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
|
|
43
|
+
/// ```
|
|
44
|
+
pub fn reduce_tokens(
|
|
45
|
+
text: &str,
|
|
46
|
+
config: &TokenReductionConfig,
|
|
47
|
+
language_hint: Option<&str>,
|
|
48
|
+
) -> crate::error::Result<String> {
|
|
49
|
+
let reducer = TokenReducer::new(config, language_hint)?;
|
|
50
|
+
Ok(reducer.reduce(text))
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Reduces token count for multiple texts efficiently using parallel processing.
|
|
54
|
+
///
|
|
55
|
+
/// This function processes multiple texts in parallel using Rayon, providing
|
|
56
|
+
/// significant performance improvements for batch operations. All texts use the
|
|
57
|
+
/// same configuration and language hint for consistency.
|
|
58
|
+
///
|
|
59
|
+
/// # Arguments
|
|
60
|
+
///
|
|
61
|
+
/// * `texts` - Slice of text references to reduce
|
|
62
|
+
/// * `config` - Configuration specifying reduction level and options
|
|
63
|
+
/// * `language_hint` - Optional ISO 639-3 language code (e.g., "eng", "spa")
|
|
64
|
+
///
|
|
65
|
+
/// # Returns
|
|
66
|
+
///
|
|
67
|
+
/// Returns a vector of reduced texts in the same order as the input.
|
|
68
|
+
///
|
|
69
|
+
/// # Errors
|
|
70
|
+
///
|
|
71
|
+
/// Returns an error if the language hint is invalid or stopwords cannot be loaded.
|
|
72
|
+
///
|
|
73
|
+
/// # Examples
|
|
74
|
+
///
|
|
75
|
+
/// ```rust
|
|
76
|
+
/// use kreuzberg::text::token_reduction::{batch_reduce_tokens, TokenReductionConfig, ReductionLevel};
|
|
77
|
+
///
|
|
78
|
+
/// let texts = vec![
|
|
79
|
+
/// "This is the first document with some text.",
|
|
80
|
+
/// "Here is another document with different content.",
|
|
81
|
+
/// "And finally, a third document to process.",
|
|
82
|
+
/// ];
|
|
83
|
+
/// let config = TokenReductionConfig::default();
|
|
84
|
+
/// let reduced = batch_reduce_tokens(&texts, &config, Some("eng"))?;
|
|
85
|
+
/// assert_eq!(reduced.len(), 3);
|
|
86
|
+
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
|
|
87
|
+
/// ```
|
|
88
|
+
pub fn batch_reduce_tokens(
|
|
89
|
+
texts: &[&str],
|
|
90
|
+
config: &TokenReductionConfig,
|
|
91
|
+
language_hint: Option<&str>,
|
|
92
|
+
) -> crate::error::Result<Vec<String>> {
|
|
93
|
+
let reducer = TokenReducer::new(config, language_hint)?;
|
|
94
|
+
Ok(reducer.batch_reduce(texts))
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/// Measures how much smaller `reduced` is than `original`.
///
/// Sizes are taken two ways: characters (Unicode scalar values, via `chars()`) and
/// tokens (whitespace-delimited words, via `split_whitespace()`). For each, the
/// function reports the absolute counts and the fraction that was removed.
///
/// # Arguments
///
/// * `original` - The original text before reduction
/// * `reduced` - The reduced text after applying token reduction
///
/// # Returns
///
/// A tuple with the following statistics (in order):
/// 1. `char_reduction` (f64) - Fraction of characters removed
/// 2. `token_reduction` (f64) - Fraction of tokens removed
/// 3. `original_chars` (usize) - Original character count
/// 4. `reduced_chars` (usize) - Reduced character count
/// 5. `original_tokens` (usize) - Original token count (whitespace-delimited)
/// 6. `reduced_tokens` (usize) - Reduced token count (whitespace-delimited)
///
/// Each ratio is `1.0 - reduced / original`, so it is at most `1.0` and lies in
/// `0.0..=1.0` whenever `reduced` is no larger than `original`. It becomes negative
/// if `reduced` is larger. When the original count is zero the ratio is reported as
/// `0.0` instead of dividing by zero.
///
/// # Examples
///
/// ```rust
/// use kreuzberg::text::token_reduction::{reduce_tokens, get_reduction_statistics, TokenReductionConfig};
///
/// let original = "This is a simple example text with some stopwords and redundancy.";
/// let config = TokenReductionConfig::default();
/// let reduced = reduce_tokens(original, &config, Some("eng"))?;
///
/// let (char_ratio, token_ratio, orig_chars, red_chars, orig_tokens, red_tokens) =
///     get_reduction_statistics(original, &reduced);
///
/// println!("Reduced {:.1}% of characters ({} -> {})", char_ratio * 100.0, orig_chars, red_chars);
/// println!("Reduced {:.1}% of tokens ({} -> {})", token_ratio * 100.0, orig_tokens, red_tokens);
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
/// ```
pub fn get_reduction_statistics(original: &str, reduced: &str) -> (f64, f64, usize, usize, usize, usize) {
    // Fraction of `before` that is gone in `after`; an empty baseline yields 0.0
    // so that we never divide by zero.
    let removed_fraction = |before: usize, after: usize| -> f64 {
        if before == 0 {
            0.0
        } else {
            1.0 - (after as f64 / before as f64)
        }
    };

    let (chars_before, chars_after) = (original.chars().count(), reduced.chars().count());
    let (tokens_before, tokens_after) = (
        original.split_whitespace().count(),
        reduced.split_whitespace().count(),
    );

    (
        removed_fraction(chars_before, chars_after),
        removed_fraction(tokens_before, tokens_after),
        chars_before,
        chars_after,
        tokens_before,
        tokens_after,
    )
}
|