kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,217 @@
1
+ use chardetng::EncodingDetector;
2
+ use encoding_rs::Encoding;
3
+ use once_cell::sync::Lazy;
4
+ use regex::Regex;
5
+ use std::borrow::Cow;
6
+ use std::collections::HashMap;
7
+ use std::sync::RwLock;
8
+
9
+ // ============================================================================
10
+
11
+ static CONTROL_CHARS: Lazy<Regex> = Lazy::new(|| {
12
+ Regex::new(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]")
13
+ .expect("Control chars regex pattern is valid and should compile")
14
+ });
15
+ static REPLACEMENT_CHARS: Lazy<Regex> =
16
+ Lazy::new(|| Regex::new(r"\u{FFFD}+").expect("Replacement chars regex pattern is valid and should compile"));
17
+ static ISOLATED_COMBINING: Lazy<Regex> = Lazy::new(|| {
18
+ Regex::new(r"[\u{0300}-\u{036F}]+")
19
+ .expect("Isolated combining diacritics regex pattern is valid and should compile")
20
+ });
21
+ static HEBREW_AS_CYRILLIC: Lazy<Regex> = Lazy::new(|| {
22
+ Regex::new(r"[\u{0400}-\u{04FF}]{3,}")
23
+ .expect("Hebrew misencoded as Cyrillic regex pattern is valid and should compile")
24
+ });
25
+
26
+ static ENCODING_CACHE: Lazy<RwLock<HashMap<String, &'static Encoding>>> = Lazy::new(|| RwLock::new(HashMap::new()));
27
+
28
+ const CACHE_SIZE_LIMIT: usize = 1000;
29
+
30
+ #[inline]
31
+ fn chain_replacements<'a>(mut text: Cow<'a, str>, replacements: &[(&Regex, &str)]) -> Cow<'a, str> {
32
+ for (pattern, replacement) in replacements {
33
+ if pattern.is_match(&text) {
34
+ text = Cow::Owned(pattern.replace_all(&text, *replacement).into_owned());
35
+ }
36
+ }
37
+ text
38
+ }
39
+
40
/// Derives the encoding-cache key for `data`.
///
/// The key is the lowercase hex rendering of a `DefaultHasher` digest over
/// at most the first 1024 bytes of `data` followed by the total length, so
/// inputs that share that prefix and length share a key.
fn calculate_cache_key(data: &[u8]) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Only a bounded prefix is hashed, so large inputs stay cheap to key.
    let prefix: &[u8] = data.get(..1024).unwrap_or(data);
    let total_len: usize = data.len();

    let mut state = DefaultHasher::new();
    prefix.hash(&mut state);
    total_len.hash(&mut state);

    let digest = state.finish();
    format!("{:x}", digest)
}
50
+
51
+ pub fn safe_decode(byte_data: &[u8], encoding: Option<&str>) -> String {
52
+ if byte_data.is_empty() {
53
+ return String::new();
54
+ }
55
+
56
+ if let Some(enc_name) = encoding
57
+ && let Some(enc) = Encoding::for_label(enc_name.as_bytes())
58
+ {
59
+ let (decoded, _, _) = enc.decode(byte_data);
60
+ return fix_mojibake_internal(&decoded);
61
+ }
62
+
63
+ let cache_key = calculate_cache_key(byte_data);
64
+
65
+ if let Ok(cache) = ENCODING_CACHE.read()
66
+ && let Some(&cached_encoding) = cache.get(&cache_key)
67
+ {
68
+ let (decoded, _, _) = cached_encoding.decode(byte_data);
69
+ return fix_mojibake_internal(&decoded);
70
+ }
71
+
72
+ let mut detector = EncodingDetector::new();
73
+ detector.feed(byte_data, true);
74
+ let encoding = detector.guess(None, true);
75
+
76
+ if let Ok(mut cache) = ENCODING_CACHE.write()
77
+ && cache.len() < CACHE_SIZE_LIMIT
78
+ {
79
+ cache.insert(cache_key, encoding);
80
+ }
81
+
82
+ let (decoded, _, had_errors) = encoding.decode(byte_data);
83
+
84
+ if had_errors {
85
+ for enc_name in &[
86
+ "windows-1255",
87
+ "iso-8859-8",
88
+ "windows-1256",
89
+ "iso-8859-6",
90
+ "windows-1252",
91
+ "cp1251",
92
+ ] {
93
+ if let Some(enc) = Encoding::for_label(enc_name.as_bytes()) {
94
+ let (test_decoded, _, test_errors) = enc.decode(byte_data);
95
+ if !test_errors && calculate_text_confidence_internal(&test_decoded) > 0.5 {
96
+ return fix_mojibake_internal(&test_decoded);
97
+ }
98
+ }
99
+ }
100
+ }
101
+
102
+ fix_mojibake_internal(&decoded)
103
+ }
104
+
105
/// Builds a cache key of the form `"<data_hash>:<size>"`.
pub fn get_encoding_cache_key(data_hash: &str, size: usize) -> String {
    format!("{data_hash}:{size}")
}
108
+
109
+ pub fn calculate_text_confidence(text: &str) -> f64 {
110
+ calculate_text_confidence_internal(text)
111
+ }
112
+
113
+ fn calculate_text_confidence_internal(text: &str) -> f64 {
114
+ if text.is_empty() {
115
+ return 0.0;
116
+ }
117
+
118
+ let total_chars = text.len() as f64;
119
+
120
+ let replacement_count = REPLACEMENT_CHARS.find_iter(text).count() as f64;
121
+ let control_count = CONTROL_CHARS.find_iter(text).count() as f64;
122
+
123
+ let penalty = (replacement_count + control_count * 2.0) / total_chars;
124
+
125
+ let readable_chars = text
126
+ .chars()
127
+ .filter(|c| c.is_ascii_graphic() || c.is_whitespace())
128
+ .count() as f64;
129
+
130
+ let readability_score = readable_chars / total_chars;
131
+
132
+ let cyrillic_matches = HEBREW_AS_CYRILLIC.find_iter(text);
133
+ let cyrillic_length: usize = cyrillic_matches.map(|m| m.len()).sum();
134
+
135
+ let mut final_penalty = penalty;
136
+ if cyrillic_length as f64 > total_chars * 0.1 {
137
+ final_penalty += 0.3;
138
+ }
139
+
140
+ (readability_score - final_penalty).clamp(0.0, 1.0)
141
+ }
142
+
143
+ pub fn fix_mojibake(text: &str) -> String {
144
+ fix_mojibake_internal(text)
145
+ }
146
+
147
+ fn fix_mojibake_internal(text: &str) -> String {
148
+ if text.is_empty() {
149
+ return text.to_string();
150
+ }
151
+
152
+ let replacements = [
153
+ (&*CONTROL_CHARS, ""),
154
+ (&*REPLACEMENT_CHARS, ""),
155
+ (&*ISOLATED_COMBINING, ""),
156
+ ];
157
+
158
+ chain_replacements(Cow::Borrowed(text), &replacements).into_owned()
159
+ }
160
+
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input decodes to an empty string.
    #[test]
    fn test_safe_decode_empty() {
        let decoded = safe_decode(b"", None);
        assert_eq!(decoded, "");
    }

    /// Plain ASCII round-trips unchanged through detection.
    #[test]
    fn test_safe_decode_ascii() {
        let expected = "Hello, World!";
        let decoded = safe_decode(expected.as_bytes(), None);
        assert_eq!(decoded, expected);
    }

    /// Multi-script UTF-8 round-trips unchanged through detection.
    #[test]
    fn test_safe_decode_utf8() {
        let expected = "Hello, 世界! مرحبا";
        let decoded = safe_decode(expected.as_bytes(), None);
        assert_eq!(decoded, expected);
    }

    /// Empty text has zero confidence.
    #[test]
    fn test_calculate_text_confidence_empty() {
        let score = calculate_text_confidence("");
        assert_eq!(score, 0.0);
    }

    /// Clean ASCII prose scores close to the maximum.
    #[test]
    fn test_calculate_text_confidence_clean_text() {
        let score = calculate_text_confidence("This is clean, readable text without any issues.");
        assert!(score > 0.9);
    }

    /// Empty text stays empty.
    #[test]
    fn test_fix_mojibake_empty() {
        let fixed = fix_mojibake("");
        assert_eq!(fixed, "");
    }

    /// Text without artifacts is returned unchanged.
    #[test]
    fn test_fix_mojibake_clean_text() {
        let original = "Clean text without mojibake";
        let fixed = fix_mojibake(original);
        assert_eq!(fixed, original);
    }

    /// Control characters are removed without leaving a placeholder.
    #[test]
    fn test_fix_mojibake_control_chars() {
        let dirty = "Text\x00with\x01control\x1Fchars";
        assert_eq!(fix_mojibake(dirty), "Textwithcontrolchars");
    }

    /// The key joins hash and size with a colon.
    #[test]
    fn test_get_encoding_cache_key() {
        assert_eq!(get_encoding_cache_key("hash123", 1024), "hash123:1024");
    }
}
@@ -0,0 +1,164 @@
1
+ use std::ops::RangeInclusive;
2
+
3
/// Tokenizer that splits CJK text into two-character chunks for token
/// reduction.
///
/// # Unicode range coverage
///
/// Only CJK Unified Ideographs (U+4E00..=U+9FFF, 20,992 code points) are
/// recognized as CJK. This covers Chinese characters and Japanese Kanji.
///
/// Not recognized, by design:
/// - Hiragana (U+3040-U+309F)
/// - Katakana (U+30A0-U+30FF)
/// - Hangul (U+AC00-U+D7AF)
///
/// Widen the range or use a language-specific tokenizer if those scripts
/// need dedicated handling.
pub struct CjkTokenizer {
    cjk_range: RangeInclusive<u32>,
}

impl CjkTokenizer {
    /// Creates a tokenizer that recognizes U+4E00..=U+9FFF as CJK.
    pub fn new() -> Self {
        let cjk_range = 0x4E00..=0x9FFF;
        Self { cjk_range }
    }

    /// Returns `true` when `c` is a CJK Unified Ideograph (U+4E00..=U+9FFF).
    ///
    /// Hiragana, Katakana, Hangul and all non-CJK characters yield `false`.
    #[inline]
    pub fn is_cjk_char(&self, c: char) -> bool {
        let code_point = u32::from(c);
        self.cjk_range.contains(&code_point)
    }

    /// Returns `true` when at least one character of `text` is CJK.
    #[inline]
    pub fn has_cjk(&self, text: &str) -> bool {
        for c in text.chars() {
            if self.is_cjk_char(c) {
                return true;
            }
        }
        false
    }

    /// Splits `text` into consecutive two-character tokens; an odd trailing
    /// character becomes a one-character token.
    pub fn tokenize_cjk_string(&self, text: &str) -> Vec<String> {
        let collected = text.chars().collect::<Vec<char>>();
        self.tokenize_cjk_chars(&collected)
    }

    /// Slice-based form of [`CjkTokenizer::tokenize_cjk_string`].
    ///
    /// Chunks do not overlap: `[a, b, c]` becomes `["ab", "c"]`.
    pub fn tokenize_cjk_chars(&self, chars: &[char]) -> Vec<String> {
        chars
            .chunks(2)
            .map(|pair| pair.iter().collect::<String>())
            .collect()
    }

    /// Tokenizes text that may mix CJK and whitespace-separated words.
    ///
    /// The text is split on whitespace. A word containing any CJK character
    /// is chunked as a whole by `tokenize_cjk_string`; any other word is kept
    /// as a single token. Empty input yields no tokens, and whitespace-only
    /// input yields that input as its only token.
    pub fn tokenize_mixed_text(&self, text: &str) -> Vec<String> {
        let mut tokens: Vec<String> = Vec::new();

        for word in text.split_whitespace() {
            if self.has_cjk(word) {
                tokens.extend(self.tokenize_cjk_string(word));
            } else {
                tokens.push(word.to_owned());
            }
        }

        // Every word yields at least one token, so an empty result means the
        // text held no words: it is either empty or whitespace-only.
        if tokens.is_empty() && !text.is_empty() {
            tokens.push(text.to_owned());
        }

        tokens
    }
}

impl Default for CjkTokenizer {
    /// Same as [`CjkTokenizer::new`].
    fn default() -> Self {
        CjkTokenizer::new()
    }
}
104
+
105
#[cfg(test)]
mod tests {
    use super::*;

    /// Ideographs are CJK; ASCII letters, digits and spaces are not.
    #[test]
    fn test_is_cjk_char() {
        let tokenizer = CjkTokenizer::new();

        for ideograph in ['中', '国', '日', '本'] {
            assert!(tokenizer.is_cjk_char(ideograph));
        }

        for other in ['a', 'Z', '1', ' '] {
            assert!(!tokenizer.is_cjk_char(other));
        }
    }

    /// A single ideograph anywhere in the text is enough.
    #[test]
    fn test_has_cjk() {
        let tokenizer = CjkTokenizer::new();

        for with_cjk in ["这是中文", "mixed 中文 text", "日本語"] {
            assert!(tokenizer.has_cjk(with_cjk));
        }

        for without_cjk in ["English text", "12345", ""] {
            assert!(!tokenizer.has_cjk(without_cjk));
        }
    }

    /// Three characters become one pair plus a trailing single.
    #[test]
    fn test_tokenize_cjk_string() {
        let tokenizer = CjkTokenizer::new();

        let first = tokenizer.tokenize_cjk_string("中国人");
        assert_eq!(first, vec!["中国", "人"]);

        let second = tokenizer.tokenize_cjk_string("四个字");
        assert_eq!(second, vec!["四个", "字"]);
    }

    /// Non-CJK words stay whole; CJK words are chunked.
    #[test]
    fn test_tokenize_mixed_text() {
        let tokenizer = CjkTokenizer::new();
        let check = |input: &str, expected: &[&str]| {
            assert_eq!(tokenizer.tokenize_mixed_text(input), expected);
        };

        check("hello world", &["hello", "world"]);
        check("中国", &["中国"]);
        check("hello 中国 world", &["hello", "中国", "world"]);
        check("学习 machine learning 技术", &["学习", "machine", "learning", "技术"]);
    }
}
@@ -0,0 +1,100 @@
1
+ use serde::{Deserialize, Serialize};
2
+ use std::collections::HashMap;
3
+
4
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
5
+ pub enum ReductionLevel {
6
+ Off = 0,
7
+ Light = 1,
8
+ Moderate = 2,
9
+ Aggressive = 3,
10
+ Maximum = 4,
11
+ }
12
+
13
+ impl ReductionLevel {
14
+ pub fn as_str(&self) -> &'static str {
15
+ match self {
16
+ ReductionLevel::Off => "off",
17
+ ReductionLevel::Light => "light",
18
+ ReductionLevel::Moderate => "moderate",
19
+ ReductionLevel::Aggressive => "aggressive",
20
+ ReductionLevel::Maximum => "maximum",
21
+ }
22
+ }
23
+ }
24
+
25
+ impl From<&str> for ReductionLevel {
26
+ fn from(s: &str) -> Self {
27
+ match s.to_lowercase().as_str() {
28
+ "off" => ReductionLevel::Off,
29
+ "light" => ReductionLevel::Light,
30
+ "moderate" => ReductionLevel::Moderate,
31
+ "aggressive" => ReductionLevel::Aggressive,
32
+ "maximum" => ReductionLevel::Maximum,
33
+ _ => ReductionLevel::Moderate,
34
+ }
35
+ }
36
+ }
37
+
38
+ #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
39
+ pub struct TokenReductionConfig {
40
+ pub level: ReductionLevel,
41
+ pub language_hint: Option<String>,
42
+ pub preserve_markdown: bool,
43
+ pub preserve_code: bool,
44
+ pub semantic_threshold: f32,
45
+ pub enable_parallel: bool,
46
+ pub use_simd: bool,
47
+ pub custom_stopwords: Option<HashMap<String, Vec<String>>>,
48
+ pub preserve_patterns: Vec<String>,
49
+ pub target_reduction: Option<f32>,
50
+ pub enable_semantic_clustering: bool,
51
+ }
52
+
53
+ impl Default for TokenReductionConfig {
54
+ fn default() -> Self {
55
+ Self {
56
+ level: ReductionLevel::Moderate,
57
+ language_hint: None,
58
+ preserve_markdown: false,
59
+ preserve_code: true,
60
+ semantic_threshold: 0.3,
61
+ enable_parallel: true,
62
+ use_simd: true,
63
+ custom_stopwords: None,
64
+ preserve_patterns: vec![],
65
+ target_reduction: None,
66
+ enable_semantic_clustering: false,
67
+ }
68
+ }
69
+ }
70
+
71
+ impl TokenReductionConfig {
72
+ #[allow(clippy::too_many_arguments)]
73
+ pub fn new(
74
+ level: ReductionLevel,
75
+ language_hint: Option<String>,
76
+ preserve_markdown: bool,
77
+ preserve_code: bool,
78
+ semantic_threshold: f32,
79
+ enable_parallel: bool,
80
+ use_simd: bool,
81
+ custom_stopwords: Option<HashMap<String, Vec<String>>>,
82
+ preserve_patterns: Option<Vec<String>>,
83
+ target_reduction: Option<f32>,
84
+ enable_semantic_clustering: bool,
85
+ ) -> Self {
86
+ Self {
87
+ level,
88
+ language_hint,
89
+ preserve_markdown,
90
+ preserve_code,
91
+ semantic_threshold: semantic_threshold.clamp(0.0, 1.0),
92
+ enable_parallel,
93
+ use_simd,
94
+ custom_stopwords,
95
+ preserve_patterns: preserve_patterns.unwrap_or_default(),
96
+ target_reduction: target_reduction.map(|t| t.clamp(0.0, 1.0)),
97
+ enable_semantic_clustering,
98
+ }
99
+ }
100
+ }