kreuzberg 4.0.0.rc1 → 4.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (342) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -8
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -534
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -9
  7. data/Gemfile.lock +9 -109
  8. data/README.md +426 -421
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -340
  12. data/ext/kreuzberg_rb/extconf.rb +45 -35
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
  15. data/ext/kreuzberg_rb/native/README.md +425 -425
  16. data/ext/kreuzberg_rb/native/build.rs +15 -17
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
  22. data/extconf.rb +28 -28
  23. data/kreuzberg.gemspec +148 -105
  24. data/lib/kreuzberg/api_proxy.rb +142 -142
  25. data/lib/kreuzberg/cache_api.rb +46 -45
  26. data/lib/kreuzberg/cli.rb +55 -55
  27. data/lib/kreuzberg/cli_proxy.rb +127 -127
  28. data/lib/kreuzberg/config.rb +691 -684
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -50
  31. data/lib/kreuzberg/extraction_api.rb +85 -84
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  35. data/lib/kreuzberg/result.rb +216 -216
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -79
  37. data/lib/kreuzberg/validator_protocol.rb +89 -89
  38. data/lib/kreuzberg/version.rb +5 -5
  39. data/lib/kreuzberg.rb +103 -82
  40. data/sig/kreuzberg/internal.rbs +184 -184
  41. data/sig/kreuzberg.rbs +520 -468
  42. data/spec/binding/cache_spec.rb +227 -227
  43. data/spec/binding/cli_proxy_spec.rb +85 -87
  44. data/spec/binding/cli_spec.rb +55 -54
  45. data/spec/binding/config_spec.rb +345 -345
  46. data/spec/binding/config_validation_spec.rb +283 -283
  47. data/spec/binding/error_handling_spec.rb +213 -213
  48. data/spec/binding/errors_spec.rb +66 -66
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  51. data/spec/binding/plugins/validator_spec.rb +274 -274
  52. data/spec/fixtures/config.toml +39 -39
  53. data/spec/fixtures/config.yaml +41 -42
  54. data/spec/fixtures/invalid_config.toml +4 -4
  55. data/spec/smoke/package_spec.rb +178 -178
  56. data/spec/spec_helper.rb +42 -42
  57. data/vendor/kreuzberg/Cargo.toml +204 -134
  58. data/vendor/kreuzberg/README.md +175 -175
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -460
  61. data/vendor/kreuzberg/src/api/error.rs +81 -81
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  64. data/vendor/kreuzberg/src/api/server.rs +353 -353
  65. data/vendor/kreuzberg/src/api/types.rs +170 -170
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -1032
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
  71. data/vendor/kreuzberg/src/core/io.rs +329 -327
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -615
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -42
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -323
  76. data/vendor/kreuzberg/src/error.rs +431 -431
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -553
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -368
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -129
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -410
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -195
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -242
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
  136. data/vendor/kreuzberg/src/lib.rs +105 -102
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -122
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -420
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -161
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -19
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  176. data/vendor/kreuzberg/src/types.rs +903 -873
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -580
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -493
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -325
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -393
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -159
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -253
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -404
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +90 -95
  331. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  332. data/spec/examples.txt +0 -104
  333. data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
  334. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
  335. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
  336. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
  337. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
  338. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
  339. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
  340. data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
  341. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
  342. data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
@@ -1,942 +1,942 @@
1
- //! Language detection using whatlang library.
2
- //!
3
- //! Provides fast language detection for extracted text content.
4
-
5
- use crate::Result;
6
- use crate::core::config::LanguageDetectionConfig;
7
- use whatlang::{Lang, detect};
8
-
9
- /// Detect languages in text using whatlang.
10
- ///
11
- /// Returns a list of detected language codes (ISO 639-3 format).
12
- /// Returns `None` if no languages could be detected with sufficient confidence.
13
- ///
14
- /// # Arguments
15
- ///
16
- /// * `text` - The text to analyze for language detection
17
- /// * `config` - Optional configuration for language detection
18
- ///
19
- /// # Example
20
- ///
21
- /// ```rust
22
- /// use kreuzberg::language_detection::detect_languages;
23
- /// use kreuzberg::core::config::LanguageDetectionConfig;
24
- ///
25
- /// let text = "Hello world! This is English text.";
26
- /// let config = LanguageDetectionConfig {
27
- /// enabled: true,
28
- /// min_confidence: 0.8,
29
- /// detect_multiple: false,
30
- /// };
31
- /// let languages = detect_languages(text, &config).expect("language detection succeeded");
32
- /// println!("Detected languages: {:?}", languages);
33
- /// ```
34
- pub fn detect_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
35
- if !config.enabled {
36
- return Ok(None);
37
- }
38
-
39
- if text.trim().is_empty() {
40
- return Ok(None);
41
- }
42
-
43
- if !config.detect_multiple {
44
- return detect_single_language(text, config);
45
- }
46
-
47
- detect_multiple_languages(text, config)
48
- }
49
-
50
- /// Detect a single primary language in the text.
51
- fn detect_single_language(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
52
- match detect(text) {
53
- Some(info) => {
54
- if info.confidence() >= config.min_confidence {
55
- let lang_code = lang_to_iso639_3(info.lang());
56
- Ok(Some(vec![lang_code]))
57
- } else {
58
- Ok(None)
59
- }
60
- }
61
- None => Ok(None),
62
- }
63
- }
64
-
65
- /// Detect multiple languages in the text by analyzing chunks.
66
- ///
67
- /// This splits the text into chunks and detects the language of each chunk,
68
- /// then returns the most common languages found.
69
- fn detect_multiple_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
70
- const CHUNK_SIZE: usize = 200;
71
- let char_vec: Vec<char> = text.chars().collect();
72
- let chunk_strings: Vec<String> = char_vec
73
- .chunks(CHUNK_SIZE)
74
- .map(|chunk| chunk.iter().collect::<String>())
75
- .collect();
76
-
77
- if chunk_strings.is_empty() {
78
- return Ok(None);
79
- }
80
-
81
- let mut lang_counts = std::collections::HashMap::new();
82
- let threshold = config.min_confidence.min(0.35);
83
-
84
- for chunk in &chunk_strings {
85
- if let Some(info) = detect(chunk)
86
- && info.confidence() >= threshold
87
- {
88
- *lang_counts.entry(info.lang()).or_insert(0) += 1;
89
- }
90
- }
91
-
92
- if lang_counts.is_empty() {
93
- return detect_single_language(text, config);
94
- }
95
-
96
- let mut lang_vec: Vec<(Lang, usize)> = lang_counts.into_iter().collect();
97
- lang_vec.sort_by(|a, b| b.1.cmp(&a.1));
98
-
99
- let languages: Vec<String> = lang_vec.iter().map(|(lang, _)| lang_to_iso639_3(*lang)).collect();
100
-
101
- Ok(Some(languages))
102
- }
103
-
104
- /// Convert whatlang Lang enum to ISO 639-3 language code.
105
- ///
106
- /// Maps whatlang's language codes to standardized ISO 639-3 codes.
107
- fn lang_to_iso639_3(lang: Lang) -> String {
108
- match lang {
109
- Lang::Eng => "eng",
110
- Lang::Rus => "rus",
111
- Lang::Cmn => "cmn",
112
- Lang::Spa => "spa",
113
- Lang::Por => "por",
114
- Lang::Ita => "ita",
115
- Lang::Fra => "fra",
116
- Lang::Deu => "deu",
117
- Lang::Ukr => "ukr",
118
- Lang::Kat => "kat",
119
- Lang::Ara => "ara",
120
- Lang::Hin => "hin",
121
- Lang::Jpn => "jpn",
122
- Lang::Heb => "heb",
123
- Lang::Yid => "yid",
124
- Lang::Pol => "pol",
125
- Lang::Amh => "amh",
126
- Lang::Jav => "jav",
127
- Lang::Kor => "kor",
128
- Lang::Nob => "nob",
129
- Lang::Dan => "dan",
130
- Lang::Swe => "swe",
131
- Lang::Fin => "fin",
132
- Lang::Tur => "tur",
133
- Lang::Nld => "nld",
134
- Lang::Hun => "hun",
135
- Lang::Ces => "ces",
136
- Lang::Ell => "ell",
137
- Lang::Bul => "bul",
138
- Lang::Bel => "bel",
139
- Lang::Mar => "mar",
140
- Lang::Kan => "kan",
141
- Lang::Ron => "ron",
142
- Lang::Slv => "slv",
143
- Lang::Hrv => "hrv",
144
- Lang::Srp => "srp",
145
- Lang::Mkd => "mkd",
146
- Lang::Lit => "lit",
147
- Lang::Lav => "lav",
148
- Lang::Est => "est",
149
- Lang::Tam => "tam",
150
- Lang::Vie => "vie",
151
- Lang::Urd => "urd",
152
- Lang::Tha => "tha",
153
- Lang::Guj => "guj",
154
- Lang::Uzb => "uzb",
155
- Lang::Pan => "pan",
156
- Lang::Aze => "aze",
157
- Lang::Ind => "ind",
158
- Lang::Tel => "tel",
159
- Lang::Pes => "pes",
160
- Lang::Mal => "mal",
161
- Lang::Ori => "ori",
162
- Lang::Mya => "mya",
163
- Lang::Nep => "nep",
164
- Lang::Sin => "sin",
165
- Lang::Khm => "khm",
166
- Lang::Tuk => "tuk",
167
- Lang::Aka => "aka",
168
- Lang::Zul => "zul",
169
- Lang::Sna => "sna",
170
- Lang::Afr => "afr",
171
- Lang::Lat => "lat",
172
- Lang::Slk => "slk",
173
- Lang::Cat => "cat",
174
- Lang::Tgl => "tgl",
175
- Lang::Hye => "hye",
176
- Lang::Epo => "epo",
177
- Lang::Ben => "ben",
178
- Lang::Cym => "cym",
179
- }
180
- .to_string()
181
- }
182
-
183
- #[cfg(test)]
184
- mod tests {
185
- use super::*;
186
-
187
- #[test]
188
- fn test_detect_single_language_english() {
189
- let text = "Hello world! This is a test of the language detection system.";
190
- let config = LanguageDetectionConfig {
191
- enabled: true,
192
- min_confidence: 0.8,
193
- detect_multiple: false,
194
- };
195
-
196
- let result = detect_languages(text, &config).unwrap();
197
- assert!(result.is_some());
198
- let langs = result.unwrap();
199
- assert_eq!(langs.len(), 1);
200
- assert_eq!(langs[0], "eng");
201
- }
202
-
203
- #[test]
204
- fn test_detect_single_language_spanish() {
205
- let text = "Hola mundo! Esta es una prueba del sistema de detección de idiomas.";
206
- let config = LanguageDetectionConfig {
207
- enabled: true,
208
- min_confidence: 0.8,
209
- detect_multiple: false,
210
- };
211
-
212
- let result = detect_languages(text, &config).unwrap();
213
- assert!(result.is_some());
214
- let langs = result.unwrap();
215
- assert_eq!(langs.len(), 1);
216
- assert_eq!(langs[0], "spa");
217
- }
218
-
219
- #[test]
220
- fn test_detect_multiple_languages() {
221
- let text = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. \
222
- Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. \
223
- Bonjour le monde! Ceci est un texte en français. Le renard brun rapide saute par-dessus le chien paresseux.";
224
- let config = LanguageDetectionConfig {
225
- enabled: true,
226
- min_confidence: 0.3,
227
- detect_multiple: true,
228
- };
229
-
230
- let result = detect_languages(text, &config).unwrap();
231
- if let Some(langs) = result {
232
- assert!(
233
- !langs.is_empty(),
234
- "If detection succeeds, should return at least one language"
235
- );
236
- }
237
- }
238
-
239
- #[test]
240
- fn test_detect_disabled() {
241
- let text = "Hello world!";
242
- let config = LanguageDetectionConfig {
243
- enabled: false,
244
- min_confidence: 0.8,
245
- detect_multiple: false,
246
- };
247
-
248
- let result = detect_languages(text, &config).unwrap();
249
- assert!(result.is_none());
250
- }
251
-
252
- #[test]
253
- fn test_detect_empty_text() {
254
- let text = "";
255
- let config = LanguageDetectionConfig {
256
- enabled: true,
257
- min_confidence: 0.8,
258
- detect_multiple: false,
259
- };
260
-
261
- let result = detect_languages(text, &config).unwrap();
262
- assert!(result.is_none());
263
- }
264
-
265
- #[test]
266
- fn test_lang_to_iso639_3() {
267
- assert_eq!(lang_to_iso639_3(Lang::Eng), "eng");
268
- assert_eq!(lang_to_iso639_3(Lang::Spa), "spa");
269
- assert_eq!(lang_to_iso639_3(Lang::Fra), "fra");
270
- assert_eq!(lang_to_iso639_3(Lang::Deu), "deu");
271
- assert_eq!(lang_to_iso639_3(Lang::Cmn), "cmn");
272
- }
273
-
274
- #[test]
275
- fn test_confidence_threshold_filters_low_confidence() {
276
- let text = "ok yes no";
277
- let high_confidence_config = LanguageDetectionConfig {
278
- enabled: true,
279
- min_confidence: 0.99,
280
- detect_multiple: false,
281
- };
282
-
283
- let result = detect_languages(text, &high_confidence_config).unwrap();
284
- assert!(result.is_none());
285
- }
286
-
287
- #[test]
288
- fn test_confidence_threshold_accepts_high_confidence() {
289
- let text = "The quick brown fox jumps over the lazy dog. This is definitely English text with clear patterns.";
290
- let low_confidence_config = LanguageDetectionConfig {
291
- enabled: true,
292
- min_confidence: 0.5,
293
- detect_multiple: false,
294
- };
295
-
296
- let result = detect_languages(text, &low_confidence_config).unwrap();
297
- assert!(result.is_some());
298
- let langs = result.unwrap();
299
- assert_eq!(langs.len(), 1);
300
- assert_eq!(langs[0], "eng");
301
- }
302
-
303
- #[test]
304
- fn test_confidence_threshold_boundary_low() {
305
- let text =
306
- "This is a comprehensive English sentence with multiple words to ensure accurate language detection.";
307
- let very_low_threshold = LanguageDetectionConfig {
308
- enabled: true,
309
- min_confidence: 0.01,
310
- detect_multiple: false,
311
- };
312
-
313
- let result = detect_languages(text, &very_low_threshold).unwrap();
314
- assert!(result.is_some());
315
- let langs = result.unwrap();
316
- assert_eq!(langs.len(), 1);
317
- assert_eq!(langs[0], "eng");
318
- }
319
-
320
- #[test]
321
- fn test_confidence_threshold_boundary_high() {
322
- let text = "The quick brown fox jumps over the lazy dog.";
323
- let max_threshold = LanguageDetectionConfig {
324
- enabled: true,
325
- min_confidence: 1.0,
326
- detect_multiple: false,
327
- };
328
-
329
- let result = detect_languages(text, &max_threshold).unwrap();
330
- if let Some(langs) = result {
331
- assert_eq!(langs.len(), 1);
332
- }
333
- }
334
-
335
- #[test]
336
- fn test_confidence_threshold_multiple_languages() {
337
- let text = format!(
338
- "{}{}",
339
- "Hello world! This is English text. The quick brown fox jumps over the lazy dog. ".repeat(10),
340
- "Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. ".repeat(10)
341
- );
342
- let high_confidence_config = LanguageDetectionConfig {
343
- enabled: true,
344
- min_confidence: 0.5,
345
- detect_multiple: true,
346
- };
347
-
348
- let result = detect_languages(&text, &high_confidence_config).unwrap();
349
- if let Some(langs) = result {
350
- assert!(
351
- !langs.is_empty(),
352
- "If detection succeeds, should find at least one language"
353
- );
354
- let has_expected = langs.contains(&"eng".to_string())
355
- || langs.contains(&"spa".to_string())
356
- || langs.contains(&"fra".to_string());
357
- assert!(has_expected, "Should detect at least one of the languages in the text");
358
- }
359
- }
360
-
361
- #[test]
362
- fn test_confidence_threshold_filters_all_chunks() {
363
- let text = "a b c d e f g h i j k ".repeat(50);
364
- let high_confidence_config = LanguageDetectionConfig {
365
- enabled: true,
366
- min_confidence: 0.95,
367
- detect_multiple: true,
368
- };
369
-
370
- let result = detect_languages(&text, &high_confidence_config).unwrap();
371
- assert!(result.is_none() || result.unwrap().is_empty());
372
- }
373
-
374
- #[test]
375
- fn test_default_confidence_threshold() {
376
- let text = "This is a clear English sentence. The quick brown fox jumps over the lazy dog. \
377
- English text is easy to detect when there is sufficient content to analyze. \
378
- Language detection works best with longer text passages that provide more context.";
379
- let config = LanguageDetectionConfig {
380
- enabled: true,
381
- min_confidence: 0.5,
382
- detect_multiple: false,
383
- };
384
-
385
- let result = detect_languages(text, &config).unwrap();
386
- if let Some(langs) = result {
387
- assert_eq!(langs.len(), 1, "Single language mode should return one language");
388
- assert_eq!(langs[0], "eng", "Should detect English");
389
- }
390
- }
391
-
392
- #[test]
393
- fn test_english_spanish_document() {
394
- let text = format!(
395
- "{}{}",
396
- "The global economy has been experiencing significant changes in recent years. International cooperation is essential for addressing climate change and sustainable development. ".repeat(5),
397
- "La economía global ha estado experimentando cambios significativos en los últimos años. La cooperación internacional es esencial para abordar el cambio climático y el desarrollo sostenible. ".repeat(5)
398
- );
399
- let config = LanguageDetectionConfig {
400
- enabled: true,
401
- min_confidence: 0.5,
402
- detect_multiple: true,
403
- };
404
-
405
- let result = detect_languages(&text, &config).unwrap();
406
- assert!(result.is_some());
407
- let langs = result.unwrap();
408
- assert!(!langs.is_empty());
409
- assert!(langs.contains(&"eng".to_string()) || langs.contains(&"spa".to_string()));
410
- }
411
-
412
- #[test]
413
- fn test_chinese_english_document() {
414
- let text = format!(
415
- "{}{}",
416
- "中国是世界上人口最多的国家。中文是世界上使用人数最多的语言之一。中华文明有着五千年的悠久历史。".repeat(5),
417
- "China is the most populous country in the world. Chinese is one of the most widely spoken languages. Chinese civilization has a long history of five thousand years. ".repeat(5)
418
- );
419
- let config = LanguageDetectionConfig {
420
- enabled: true,
421
- min_confidence: 0.4,
422
- detect_multiple: true,
423
- };
424
-
425
- let result = detect_languages(&text, &config).unwrap();
426
- assert!(result.is_some());
427
- let langs = result.unwrap();
428
- assert!(!langs.is_empty());
429
- assert!(langs.contains(&"cmn".to_string()) || langs.contains(&"eng".to_string()));
430
- }
431
-
432
- #[test]
433
- fn test_french_german_document() {
434
- let text = format!(
435
- "{}{}",
436
- "La France est connue pour sa culture riche et sa cuisine délicieuse. Paris est la capitale de la France et une destination touristique populaire. ".repeat(5),
437
- "Deutschland ist bekannt für seine Ingenieurskunst und seine reiche Geschichte. Berlin ist die Hauptstadt Deutschlands und eine lebendige Metropole. ".repeat(5)
438
- );
439
- let config = LanguageDetectionConfig {
440
- enabled: true,
441
- min_confidence: 0.5,
442
- detect_multiple: true,
443
- };
444
-
445
- let result = detect_languages(&text, &config).unwrap();
446
- assert!(result.is_some());
447
- let langs = result.unwrap();
448
- assert!(!langs.is_empty());
449
- }
450
-
451
- #[test]
452
- fn test_russian_ukrainian_document() {
453
- let text = format!(
454
- "{}{}",
455
- "Россия является крупнейшей страной в мире по территории. Москва - столица России и крупнейший город страны. ".repeat(5),
456
- "Україна є країною в Східній Європі. Київ - столиця України та найбільше місто країни. ".repeat(5)
457
- );
458
- let config = LanguageDetectionConfig {
459
- enabled: true,
460
- min_confidence: 0.5,
461
- detect_multiple: true,
462
- };
463
-
464
- let result = detect_languages(&text, &config).unwrap();
465
- assert!(result.is_some());
466
- let langs = result.unwrap();
467
- assert!(!langs.is_empty());
468
- }
469
-
470
- #[test]
471
- fn test_romance_languages() {
472
- let text = "L'Italia è famosa per la sua arte e architettura. O português é falado em vários países. El español es uno de los idiomas más hablados del mundo. ".repeat(3);
473
- let config = LanguageDetectionConfig {
474
- enabled: true,
475
- min_confidence: 0.5,
476
- detect_multiple: true,
477
- };
478
-
479
- let result = detect_languages(&text, &config).unwrap();
480
- assert!(result.is_some());
481
- let langs = result.unwrap();
482
- assert!(!langs.is_empty());
483
- }
484
-
485
- #[test]
486
- fn test_germanic_languages() {
487
- let text = "Deutschland hat eine reiche Kulturgeschichte. Nederland is bekend om zijn tulpen en windmolens. Sverige är känt för sina skogar och innovationer. ".repeat(3);
488
- let config = LanguageDetectionConfig {
489
- enabled: true,
490
- min_confidence: 0.5,
491
- detect_multiple: true,
492
- };
493
-
494
- let result = detect_languages(&text, &config).unwrap();
495
- assert!(result.is_some());
496
- let langs = result.unwrap();
497
- assert!(!langs.is_empty());
498
- }
499
-
500
- #[test]
501
- fn test_slavic_languages() {
502
- let text = "Polska jest krajem w Europie Środkowej. Česká republika má bohatou historii. България е страна на Балканския полуостров. ".repeat(3);
503
- let config = LanguageDetectionConfig {
504
- enabled: true,
505
- min_confidence: 0.5,
506
- detect_multiple: true,
507
- };
508
-
509
- let result = detect_languages(&text, &config).unwrap();
510
- assert!(result.is_some());
511
- let langs = result.unwrap();
512
- assert!(!langs.is_empty());
513
- }
514
-
515
- #[test]
516
- fn test_cjk_languages() {
517
- let text = "中国是一个历史悠久的国家。日本は美しい桜の国です。한국은 아시아의 선진국입니다。".repeat(3);
518
- let config = LanguageDetectionConfig {
519
- enabled: true,
520
- min_confidence: 0.4,
521
- detect_multiple: true,
522
- };
523
-
524
- let result = detect_languages(&text, &config).unwrap();
525
- assert!(result.is_some());
526
- let langs = result.unwrap();
527
- assert!(!langs.is_empty());
528
- }
529
-
530
- #[test]
531
- fn test_arabic_persian() {
532
- let text = "اللغة العربية هي واحدة من أقدم اللغات في العالم. زبان فارسی زبانی زیبا و شاعرانه است. ".repeat(5);
533
- let config = LanguageDetectionConfig {
534
- enabled: true,
535
- min_confidence: 0.4,
536
- detect_multiple: true,
537
- };
538
-
539
- let result = detect_languages(&text, &config).unwrap();
540
- assert!(result.is_some());
541
- let langs = result.unwrap();
542
- assert!(!langs.is_empty());
543
- }
544
-
545
- #[test]
546
- fn test_very_short_text() {
547
- let text = "Hello";
548
- let config = LanguageDetectionConfig {
549
- enabled: true,
550
- min_confidence: 0.5,
551
- detect_multiple: false,
552
- };
553
-
554
- let result = detect_languages(text, &config).unwrap();
555
- if let Some(langs) = result {
556
- assert!(!langs.is_empty());
557
- }
558
- }
559
-
560
- #[test]
561
- fn test_medium_length_text() {
562
- let text = "Machine learning is a subset of artificial intelligence that enables computers to learn from data.";
563
- let config = LanguageDetectionConfig {
564
- enabled: true,
565
- min_confidence: 0.5,
566
- detect_multiple: false,
567
- };
568
-
569
- let result = detect_languages(text, &config).unwrap();
570
- assert!(result.is_some());
571
- let langs = result.unwrap();
572
- assert_eq!(langs.len(), 1);
573
- assert_eq!(langs[0], "eng");
574
- }
575
-
576
- #[test]
577
- fn test_very_long_text() {
578
- let paragraph = "The advancement of technology in the twenty-first century has transformed how we live, work, and communicate. \
579
- From smartphones to artificial intelligence, these innovations have created unprecedented opportunities and challenges. \
580
- Understanding the implications of technological progress requires careful consideration of ethical, social, and economic factors. ";
581
- let text = paragraph.repeat(20);
582
- let config = LanguageDetectionConfig {
583
- enabled: true,
584
- min_confidence: 0.7,
585
- detect_multiple: false,
586
- };
587
-
588
- let result = detect_languages(&text, &config).unwrap();
589
- assert!(result.is_some());
590
- let langs = result.unwrap();
591
- assert_eq!(langs.len(), 1);
592
- assert_eq!(langs[0], "eng");
593
- }
594
-
595
- #[test]
596
- fn test_numbers_only() {
597
- let text = "123456789 0123456789 987654321";
598
- let config = LanguageDetectionConfig {
599
- enabled: true,
600
- min_confidence: 0.5,
601
- detect_multiple: false,
602
- };
603
-
604
- let result = detect_languages(text, &config).unwrap();
605
- assert!(result.is_none());
606
- }
607
-
608
- #[test]
609
- fn test_punctuation_only() {
610
- let text = "!!! ??? ... --- *** @@@ ###";
611
- let config = LanguageDetectionConfig {
612
- enabled: true,
613
- min_confidence: 0.5,
614
- detect_multiple: false,
615
- };
616
-
617
- let result = detect_languages(text, &config).unwrap();
618
- assert!(result.is_none());
619
- }
620
-
621
- #[test]
622
- fn test_whitespace_only() {
623
- let text = " \t\n \n\n\t\t ";
624
- let config = LanguageDetectionConfig {
625
- enabled: true,
626
- min_confidence: 0.5,
627
- detect_multiple: false,
628
- };
629
-
630
- let result = detect_languages(text, &config).unwrap();
631
- assert!(result.is_none());
632
- }
633
-
634
- #[test]
635
- fn test_mixed_numbers_and_text() {
636
- let text = "The year 2024 marks the 100th anniversary of the founding. Over 50 countries participated in the event with more than 10,000 attendees.";
637
- let config = LanguageDetectionConfig {
638
- enabled: true,
639
- min_confidence: 0.5,
640
- detect_multiple: false,
641
- };
642
-
643
- let result = detect_languages(text, &config).unwrap();
644
- assert!(result.is_some());
645
- let langs = result.unwrap();
646
- assert_eq!(langs[0], "eng");
647
- }
648
-
649
- #[test]
650
- fn test_text_with_urls() {
651
- let text = "Visit our website at https://example.com for more information. You can also contact us at info@example.com or follow us on social media.";
652
- let config = LanguageDetectionConfig {
653
- enabled: true,
654
- min_confidence: 0.5,
655
- detect_multiple: false,
656
- };
657
-
658
- let result = detect_languages(text, &config).unwrap();
659
- assert!(result.is_some());
660
- let langs = result.unwrap();
661
- assert_eq!(langs[0], "eng");
662
- }
663
-
664
- #[test]
665
- fn test_text_with_email_addresses() {
666
- let text = "Please send your resume to jobs@company.com or contact.us@example.org for inquiries. Our support team at support@help.com is available 24/7.";
667
- let config = LanguageDetectionConfig {
668
- enabled: true,
669
- min_confidence: 0.5,
670
- detect_multiple: false,
671
- };
672
-
673
- let result = detect_languages(text, &config).unwrap();
674
- assert!(result.is_some());
675
- let langs = result.unwrap();
676
- assert_eq!(langs[0], "eng");
677
- }
678
-
679
- #[test]
680
- fn test_code_with_comments() {
681
- let text = r#"
682
- // This function calculates the factorial of a number
683
- fn factorial(n: u64) -> u64 {
684
- if n == 0 {
685
- return 1;
686
- }
687
- n * factorial(n - 1)
688
- }
689
-
690
- // The algorithm uses recursion to compute the result efficiently
691
- // It handles edge cases like zero and negative numbers appropriately
692
- "#;
693
- let config = LanguageDetectionConfig {
694
- enabled: true,
695
- min_confidence: 0.4,
696
- detect_multiple: false,
697
- };
698
-
699
- let result = detect_languages(text, &config).unwrap();
700
- if let Some(langs) = result {
701
- assert!(!langs.is_empty());
702
- }
703
- }
704
-
705
- #[test]
706
- fn test_predominantly_code() {
707
- let text = r#"
708
- let x = 42;
709
- let y = x * 2;
710
- println!("{}", y);
711
- fn main() {
712
- let vec = vec![1, 2, 3];
713
- for i in vec {
714
- println!("{}", i);
715
- }
716
- }
717
- "#;
718
- let config = LanguageDetectionConfig {
719
- enabled: true,
720
- min_confidence: 0.5,
721
- detect_multiple: false,
722
- };
723
-
724
- let result = detect_languages(text, &config).unwrap();
725
- assert!(result.is_none() || result.as_ref().unwrap().is_empty() || result.as_ref().unwrap().len() <= 1);
726
- }
727
-
728
- #[test]
729
- fn test_documentation_with_code() {
730
- let text = r#"
731
- Language detection is an important feature in document processing systems.
732
- It allows applications to automatically identify the language of text content.
733
- This is particularly useful for multilingual documents and international applications.
734
-
735
- Example code:
736
- let config = LanguageDetectionConfig::default();
737
- let result = detect_languages(text, &config);
738
-
739
- The detection algorithm analyzes character patterns and word frequencies to determine the most likely language.
740
- Modern detection systems achieve high accuracy rates across dozens of languages.
741
- "#;
742
- let config = LanguageDetectionConfig {
743
- enabled: true,
744
- min_confidence: 0.5,
745
- detect_multiple: false,
746
- };
747
-
748
- let result = detect_languages(text, &config).unwrap();
749
- assert!(result.is_some());
750
- let langs = result.unwrap();
751
- assert_eq!(langs[0], "eng");
752
- }
753
-
754
- #[test]
755
- fn test_medical_terminology() {
756
- let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
757
- The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
758
- Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
759
- let config = LanguageDetectionConfig {
760
- enabled: true,
761
- min_confidence: 0.5,
762
- detect_multiple: false,
763
- };
764
-
765
- let result = detect_languages(text, &config).unwrap();
766
- assert!(result.is_some());
767
- let langs = result.unwrap();
768
- assert_eq!(langs[0], "eng");
769
- }
770
-
771
- #[test]
772
- fn test_legal_terminology() {
773
- let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
774
- Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
775
- The court finds that the preponderance of evidence supports the plaintiff's claims.";
776
- let config = LanguageDetectionConfig {
777
- enabled: true,
778
- min_confidence: 0.5,
779
- detect_multiple: false,
780
- };
781
-
782
- let result = detect_languages(text, &config).unwrap();
783
- assert!(result.is_some());
784
- let langs = result.unwrap();
785
- assert_eq!(langs[0], "eng");
786
- }
787
-
788
- #[test]
789
- fn test_scientific_terminology() {
790
- let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
791
- Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
792
- The results demonstrated significant correlation between molecular structure and optical properties.";
793
- let config = LanguageDetectionConfig {
794
- enabled: true,
795
- min_confidence: 0.5,
796
- detect_multiple: false,
797
- };
798
-
799
- let result = detect_languages(text, &config).unwrap();
800
- assert!(result.is_some());
801
- let langs = result.unwrap();
802
- assert_eq!(langs[0], "eng");
803
- }
804
-
805
- #[test]
806
- fn test_latin_cyrillic_mix() {
807
- let text = format!(
808
- "{}{}",
809
- "Modern technology enables global communication across language barriers. ".repeat(5),
810
- "Современные технологии позволяют общаться по всему миру. ".repeat(5)
811
- );
812
- let config = LanguageDetectionConfig {
813
- enabled: true,
814
- min_confidence: 0.5,
815
- detect_multiple: true,
816
- };
817
-
818
- let result = detect_languages(&text, &config).unwrap();
819
- assert!(result.is_some());
820
- let langs = result.unwrap();
821
- assert!(!langs.is_empty());
822
- }
823
-
824
- #[test]
825
- fn test_latin_cjk_mix() {
826
- let text = format!(
827
- "{}{}",
828
- "Technology companies are expanding into Asian markets. ".repeat(5),
829
- "科技公司正在进军亚洲市场。".repeat(5)
830
- );
831
- let config = LanguageDetectionConfig {
832
- enabled: true,
833
- min_confidence: 0.4,
834
- detect_multiple: true,
835
- };
836
-
837
- let result = detect_languages(&text, &config).unwrap();
838
- assert!(result.is_some());
839
- let langs = result.unwrap();
840
- assert!(!langs.is_empty());
841
- }
842
-
843
- #[test]
844
- fn test_latin_arabic_mix() {
845
- let text = format!(
846
- "{}{}",
847
- "International cooperation is essential for global peace and prosperity. ".repeat(5),
848
- "التعاون الدولي ضروري للسلام والازدهار العالمي. ".repeat(5)
849
- );
850
- let config = LanguageDetectionConfig {
851
- enabled: true,
852
- min_confidence: 0.4,
853
- detect_multiple: true,
854
- };
855
-
856
- let result = detect_languages(&text, &config).unwrap();
857
- assert!(result.is_some());
858
- let langs = result.unwrap();
859
- assert!(!langs.is_empty());
860
- }
861
-
862
- #[test]
863
- fn test_single_word_detection() {
864
- let words = vec![("hello", "eng"), ("bonjour", "fra"), ("hola", "spa"), ("привет", "rus")];
865
-
866
- let config = LanguageDetectionConfig {
867
- enabled: true,
868
- min_confidence: 0.3,
869
- detect_multiple: false,
870
- };
871
-
872
- for (word, _expected_lang) in words {
873
- let result = detect_languages(word, &config).unwrap();
874
- if let Some(langs) = result {
875
- assert!(!langs.is_empty());
876
- }
877
- }
878
- }
879
-
880
- #[test]
881
- fn test_repetitive_text() {
882
- let text = "test test test test test ".repeat(100);
883
- let config = LanguageDetectionConfig {
884
- enabled: true,
885
- min_confidence: 0.5,
886
- detect_multiple: false,
887
- };
888
-
889
- let result = detect_languages(&text, &config).unwrap();
890
- if let Some(langs) = result {
891
- assert!(!langs.is_empty());
892
- }
893
- }
894
-
895
- #[test]
896
- fn test_detection_consistency() {
897
- let text = "This is a consistent test of language detection capabilities across multiple runs.";
898
- let config = LanguageDetectionConfig {
899
- enabled: true,
900
- min_confidence: 0.5,
901
- detect_multiple: false,
902
- };
903
-
904
- let result1 = detect_languages(text, &config).unwrap();
905
- let result2 = detect_languages(text, &config).unwrap();
906
-
907
- assert_eq!(result1, result2, "Detection should be deterministic");
908
- }
909
-
910
- #[test]
911
- fn test_chunk_size_boundary() {
912
- let chunk_text = "a".repeat(500);
913
- let config = LanguageDetectionConfig {
914
- enabled: true,
915
- min_confidence: 0.5,
916
- detect_multiple: true,
917
- };
918
-
919
- let result = detect_languages(&chunk_text, &config).unwrap();
920
- assert!(result.is_none() || result.is_some());
921
-
922
- let over_chunk = "This is English text. ".repeat(30);
923
- let result2 = detect_languages(&over_chunk, &config).unwrap();
924
- assert!(result2.is_none() || result2.is_some());
925
- }
926
-
927
- #[test]
928
- fn test_special_characters_with_text() {
929
- let text =
930
- "The company's revenue increased by 25% year-over-year. CEO said: \"We're excited!\" #growth @investors";
931
- let config = LanguageDetectionConfig {
932
- enabled: true,
933
- min_confidence: 0.5,
934
- detect_multiple: false,
935
- };
936
-
937
- let result = detect_languages(text, &config).unwrap();
938
- assert!(result.is_some());
939
- let langs = result.unwrap();
940
- assert_eq!(langs[0], "eng");
941
- }
942
- }
1
+ //! Language detection using the whatlang library.
2
+ //!
3
+ //! Provides fast language detection for extracted text content.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::LanguageDetectionConfig;
7
+ use whatlang::{Lang, detect};
8
+
9
+ /// Detect languages in text using whatlang.
10
+ ///
11
+ /// Returns a list of detected language codes (ISO 639-3 format).
12
+ /// Returns `None` if no languages could be detected with sufficient confidence.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `text` - The text to analyze for language detection
17
+ /// * `config` - Optional configuration for language detection
18
+ ///
19
+ /// # Example
20
+ ///
21
+ /// ```rust
22
+ /// use kreuzberg::language_detection::detect_languages;
23
+ /// use kreuzberg::core::config::LanguageDetectionConfig;
24
+ ///
25
+ /// let text = "Hello world! This is English text.";
26
+ /// let config = LanguageDetectionConfig {
27
+ /// enabled: true,
28
+ /// min_confidence: 0.8,
29
+ /// detect_multiple: false,
30
+ /// };
31
+ /// let languages = detect_languages(text, &config).expect("language detection succeeded");
32
+ /// println!("Detected languages: {:?}", languages);
33
+ /// ```
34
+ pub fn detect_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
35
+ if !config.enabled {
36
+ return Ok(None);
37
+ }
38
+
39
+ if text.trim().is_empty() {
40
+ return Ok(None);
41
+ }
42
+
43
+ if !config.detect_multiple {
44
+ return detect_single_language(text, config);
45
+ }
46
+
47
+ detect_multiple_languages(text, config)
48
+ }
49
+
50
+ /// Detect a single primary language in the text.
51
+ fn detect_single_language(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
52
+ match detect(text) {
53
+ Some(info) => {
54
+ if info.confidence() >= config.min_confidence {
55
+ let lang_code = lang_to_iso639_3(info.lang());
56
+ Ok(Some(vec![lang_code]))
57
+ } else {
58
+ Ok(None)
59
+ }
60
+ }
61
+ None => Ok(None),
62
+ }
63
+ }
64
+
65
+ /// Detect multiple languages in the text by analyzing chunks.
66
+ ///
67
+ /// This splits the text into chunks and detects the language of each chunk,
68
+ /// then returns the most common languages found.
69
+ fn detect_multiple_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
70
+ const CHUNK_SIZE: usize = 200;
71
+ let char_vec: Vec<char> = text.chars().collect();
72
+ let chunk_strings: Vec<String> = char_vec
73
+ .chunks(CHUNK_SIZE)
74
+ .map(|chunk| chunk.iter().collect::<String>())
75
+ .collect();
76
+
77
+ if chunk_strings.is_empty() {
78
+ return Ok(None);
79
+ }
80
+
81
+ let mut lang_counts = std::collections::HashMap::new();
82
+ let threshold = config.min_confidence.min(0.35);
83
+
84
+ for chunk in &chunk_strings {
85
+ if let Some(info) = detect(chunk)
86
+ && info.confidence() >= threshold
87
+ {
88
+ *lang_counts.entry(info.lang()).or_insert(0) += 1;
89
+ }
90
+ }
91
+
92
+ if lang_counts.is_empty() {
93
+ return detect_single_language(text, config);
94
+ }
95
+
96
+ let mut lang_vec: Vec<(Lang, usize)> = lang_counts.into_iter().collect();
97
+ lang_vec.sort_by(|a, b| b.1.cmp(&a.1));
98
+
99
+ let languages: Vec<String> = lang_vec.iter().map(|(lang, _)| lang_to_iso639_3(*lang)).collect();
100
+
101
+ Ok(Some(languages))
102
+ }
103
+
104
+ /// Convert whatlang Lang enum to ISO 639-3 language code.
105
+ ///
106
+ /// Maps whatlang's language codes to standardized ISO 639-3 codes.
107
+ fn lang_to_iso639_3(lang: Lang) -> String {
108
+ match lang {
109
+ Lang::Eng => "eng",
110
+ Lang::Rus => "rus",
111
+ Lang::Cmn => "cmn",
112
+ Lang::Spa => "spa",
113
+ Lang::Por => "por",
114
+ Lang::Ita => "ita",
115
+ Lang::Fra => "fra",
116
+ Lang::Deu => "deu",
117
+ Lang::Ukr => "ukr",
118
+ Lang::Kat => "kat",
119
+ Lang::Ara => "ara",
120
+ Lang::Hin => "hin",
121
+ Lang::Jpn => "jpn",
122
+ Lang::Heb => "heb",
123
+ Lang::Yid => "yid",
124
+ Lang::Pol => "pol",
125
+ Lang::Amh => "amh",
126
+ Lang::Jav => "jav",
127
+ Lang::Kor => "kor",
128
+ Lang::Nob => "nob",
129
+ Lang::Dan => "dan",
130
+ Lang::Swe => "swe",
131
+ Lang::Fin => "fin",
132
+ Lang::Tur => "tur",
133
+ Lang::Nld => "nld",
134
+ Lang::Hun => "hun",
135
+ Lang::Ces => "ces",
136
+ Lang::Ell => "ell",
137
+ Lang::Bul => "bul",
138
+ Lang::Bel => "bel",
139
+ Lang::Mar => "mar",
140
+ Lang::Kan => "kan",
141
+ Lang::Ron => "ron",
142
+ Lang::Slv => "slv",
143
+ Lang::Hrv => "hrv",
144
+ Lang::Srp => "srp",
145
+ Lang::Mkd => "mkd",
146
+ Lang::Lit => "lit",
147
+ Lang::Lav => "lav",
148
+ Lang::Est => "est",
149
+ Lang::Tam => "tam",
150
+ Lang::Vie => "vie",
151
+ Lang::Urd => "urd",
152
+ Lang::Tha => "tha",
153
+ Lang::Guj => "guj",
154
+ Lang::Uzb => "uzb",
155
+ Lang::Pan => "pan",
156
+ Lang::Aze => "aze",
157
+ Lang::Ind => "ind",
158
+ Lang::Tel => "tel",
159
+ Lang::Pes => "pes",
160
+ Lang::Mal => "mal",
161
+ Lang::Ori => "ori",
162
+ Lang::Mya => "mya",
163
+ Lang::Nep => "nep",
164
+ Lang::Sin => "sin",
165
+ Lang::Khm => "khm",
166
+ Lang::Tuk => "tuk",
167
+ Lang::Aka => "aka",
168
+ Lang::Zul => "zul",
169
+ Lang::Sna => "sna",
170
+ Lang::Afr => "afr",
171
+ Lang::Lat => "lat",
172
+ Lang::Slk => "slk",
173
+ Lang::Cat => "cat",
174
+ Lang::Tgl => "tgl",
175
+ Lang::Hye => "hye",
176
+ Lang::Epo => "epo",
177
+ Lang::Ben => "ben",
178
+ Lang::Cym => "cym",
179
+ }
180
+ .to_string()
181
+ }
182
+
183
+ #[cfg(test)]
184
+ mod tests {
185
+ use super::*;
186
+
187
#[test]
fn test_detect_single_language_english() {
    // Single-language mode on plain English prose must yield exactly ["eng"].
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.8,
        detect_multiple: false,
    };
    let sample = "Hello world! This is a test of the language detection system.";

    let detected = detect_languages(sample, &settings).unwrap();

    assert_eq!(detected, Some(vec!["eng".to_string()]));
}
202
+
203
#[test]
fn test_detect_single_language_spanish() {
    // Single-language mode on Spanish prose must yield exactly ["spa"].
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.8,
        detect_multiple: false,
    };
    let sample = "Hola mundo! Esta es una prueba del sistema de detección de idiomas.";

    let detected = detect_languages(sample, &settings).unwrap();

    assert_eq!(detected, Some(vec!["spa".to_string()]));
}
218
+
219
#[test]
fn test_detect_multiple_languages() {
    // Mixed English / Spanish / French input in multi-language mode:
    // detection may return nothing, but never an empty list.
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.3,
        detect_multiple: true,
    };
    let sample = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. \
Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. \
Bonjour le monde! Ceci est un texte en français. Le renard brun rapide saute par-dessus le chien paresseux.";

    let detected = detect_languages(sample, &settings).unwrap();

    assert!(
        detected.map_or(true, |codes| !codes.is_empty()),
        "If detection succeeds, should return at least one language"
    );
}
238
+
239
#[test]
fn test_detect_disabled() {
    // With `enabled: false` nothing is detected, whatever the input.
    let settings = LanguageDetectionConfig {
        enabled: false,
        min_confidence: 0.8,
        detect_multiple: false,
    };

    let detected = detect_languages("Hello world!", &settings).unwrap();

    assert_eq!(detected, None);
}
251
+
252
#[test]
fn test_detect_empty_text() {
    // Empty input short-circuits to `None` even when detection is enabled.
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.8,
        detect_multiple: false,
    };

    let detected = detect_languages("", &settings).unwrap();

    assert_eq!(detected, None);
}
264
+
265
#[test]
fn test_lang_to_iso639_3() {
    // Spot-check the Lang -> ISO 639-3 mapping for a handful of languages.
    let expectations = [
        (Lang::Eng, "eng"),
        (Lang::Spa, "spa"),
        (Lang::Fra, "fra"),
        (Lang::Deu, "deu"),
        (Lang::Cmn, "cmn"),
    ];

    for (lang, code) in expectations {
        assert_eq!(lang_to_iso639_3(lang), code);
    }
}
273
+
274
#[test]
fn test_confidence_threshold_filters_low_confidence() {
    // A few ambiguous short words must not clear a 0.99 confidence threshold.
    let strict = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.99,
        detect_multiple: false,
    };

    let detected = detect_languages("ok yes no", &strict).unwrap();

    assert_eq!(detected, None);
}
286
+
287
#[test]
fn test_confidence_threshold_accepts_high_confidence() {
    // Clear English prose must pass a moderate 0.5 threshold as ["eng"].
    let lenient = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sample = "The quick brown fox jumps over the lazy dog. This is definitely English text with clear patterns.";

    let detected = detect_languages(sample, &lenient).unwrap();

    assert_eq!(detected, Some(vec!["eng".to_string()]));
}
302
+
303
#[test]
fn test_confidence_threshold_boundary_low() {
    // A near-zero threshold (0.01) accepts the detection of clear English text.
    let near_zero = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.01,
        detect_multiple: false,
    };
    let sample =
        "This is a comprehensive English sentence with multiple words to ensure accurate language detection.";

    let detected = detect_languages(sample, &near_zero).unwrap();

    assert_eq!(detected, Some(vec!["eng".to_string()]));
}
319
+
320
#[test]
fn test_confidence_threshold_boundary_high() {
    // At the maximum threshold (1.0) detection may return nothing;
    // if it does return a result, it must hold exactly one code.
    let maximal = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 1.0,
        detect_multiple: false,
    };

    let detected = detect_languages("The quick brown fox jumps over the lazy dog.", &maximal).unwrap();

    match detected {
        Some(codes) => assert_eq!(codes.len(), 1),
        None => {}
    }
}
334
+
335
#[test]
fn test_confidence_threshold_multiple_languages() {
    // Ten English sentences followed by ten Spanish ones, multi-language mode.
    let sample = [
        "Hello world! This is English text. The quick brown fox jumps over the lazy dog. ".repeat(10),
        "Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. ".repeat(10),
    ]
    .concat();
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    // Nothing detected is tolerated; only a returned list is inspected.
    let Some(codes) = detect_languages(&sample, &settings).unwrap() else {
        return;
    };

    assert!(
        !codes.is_empty(),
        "If detection succeeds, should find at least one language"
    );
    let has_expected = codes
        .iter()
        .any(|code| matches!(code.as_str(), "eng" | "spa" | "fra"));
    assert!(has_expected, "Should detect at least one of the languages in the text");
}
360
+
361
#[test]
fn test_confidence_threshold_filters_all_chunks() {
    // Isolated single letters carry no usable signal: the result must be
    // either `None` or an empty list.
    let strict = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.95,
        detect_multiple: true,
    };
    let sample = "a b c d e f g h i j k ".repeat(50);

    let detected = detect_languages(&sample, &strict).unwrap();

    assert!(detected.map_or(true, |codes| codes.is_empty()));
}
373
+
374
#[test]
fn test_default_confidence_threshold() {
    // Longer English passage at a 0.5 threshold in single-language mode.
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sample = "This is a clear English sentence. The quick brown fox jumps over the lazy dog. \
English text is easy to detect when there is sufficient content to analyze. \
Language detection works best with longer text passages that provide more context.";

    // Nothing detected is tolerated; only a returned list is inspected.
    let Some(codes) = detect_languages(sample, &settings).unwrap() else {
        return;
    };

    assert_eq!(codes.len(), 1, "Single language mode should return one language");
    assert_eq!(codes[0], "eng", "Should detect English");
}
391
+
392
#[test]
fn test_english_spanish_document() {
    // Half English, half Spanish: multi-language mode must report at least
    // one of the two.
    let sample = [
        "The global economy has been experiencing significant changes in recent years. International cooperation is essential for addressing climate change and sustainable development. ".repeat(5),
        "La economía global ha estado experimentando cambios significativos en los últimos años. La cooperación internacional es esencial para abordar el cambio climático y el desarrollo sostenible. ".repeat(5),
    ]
    .concat();
    let settings = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let detected = detect_languages(&sample, &settings).unwrap();
    assert!(detected.is_some());

    let codes = detected.unwrap();
    assert!(!codes.is_empty());
    assert!(codes.iter().any(|code| code == "eng" || code == "spa"));
}
411
+
412
/// Multi-language detection on a Chinese block followed by an English block
/// (0.4 confidence threshold) must report `cmn` or `eng`.
#[test]
fn test_chinese_english_document() {
    let chinese = "中国是世界上人口最多的国家。中文是世界上使用人数最多的语言之一。中华文明有着五千年的悠久历史。";
    let english = "China is the most populous country in the world. Chinese is one of the most widely spoken languages. Chinese civilization has a long history of five thousand years. ";

    let mut text = chinese.repeat(5);
    text.push_str(&english.repeat(5));

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    // A missing result collapses to an empty list, which fails the checks below.
    let langs = detect_languages(&text, &config).unwrap().unwrap_or_default();
    assert!(!langs.is_empty());
    assert!(langs.iter().any(|code| code == "cmn" || code == "eng"));
}
431
+
432
/// Multi-language detection on French text followed by German text must
/// return a non-empty language list.
#[test]
fn test_french_german_document() {
    let french = "La France est connue pour sa culture riche et sa cuisine délicieuse. Paris est la capitale de la France et une destination touristique populaire. ";
    let german = "Deutschland ist bekannt für seine Ingenieurskunst und seine reiche Geschichte. Berlin ist die Hauptstadt Deutschlands und eine lebendige Metropole. ";
    let text = french.repeat(5) + &german.repeat(5);

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
450
+
451
/// Multi-language detection on Russian text followed by Ukrainian text must
/// return a non-empty language list.
#[test]
fn test_russian_ukrainian_document() {
    let russian = "Россия является крупнейшей страной в мире по территории. Москва - столица России и крупнейший город страны. ";
    let ukrainian = "Україна є країною в Східній Європі. Київ - столиця України та найбільше місто країни. ";
    let text = russian.repeat(5) + &ukrainian.repeat(5);

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    // A missing result collapses to an empty list, which fails the check below.
    let langs = detect_languages(&text, &config).unwrap().unwrap_or_default();
    assert!(!langs.is_empty());
}
469
+
470
/// Multi-language detection on interleaved Italian, Portuguese and Spanish
/// sentences must return a non-empty language list.
#[test]
fn test_romance_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sentences = "L'Italia è famosa per la sua arte e architettura. O português é falado em vários países. El español es uno de los idiomas más hablados del mundo. ";
    let text = sentences.repeat(3);

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
484
+
485
/// Multi-language detection on interleaved German, Dutch and Swedish
/// sentences must return a non-empty language list.
#[test]
fn test_germanic_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sentences = "Deutschland hat eine reiche Kulturgeschichte. Nederland is bekend om zijn tulpen en windmolens. Sverige är känt för sina skogar och innovationer. ";
    let text = sentences.repeat(3);

    // A missing result collapses to an empty list, which fails the check below.
    let langs = detect_languages(&text, &config).unwrap().unwrap_or_default();
    assert!(!langs.is_empty());
}
499
+
500
/// Multi-language detection on interleaved Polish, Czech and Bulgarian
/// sentences must return a non-empty language list.
#[test]
fn test_slavic_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sentences = "Polska jest krajem w Europie Środkowej. Česká republika má bohatou historii. България е страна на Балканския полуостров. ";
    let text = sentences.repeat(3);

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
514
+
515
/// Multi-language detection on interleaved Chinese, Japanese and Korean
/// sentences (0.4 confidence threshold) must return a non-empty list.
#[test]
fn test_cjk_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };
    let sentences = "中国是一个历史悠久的国家。日本は美しい桜の国です。한국은 아시아의 선진국입니다。";
    let text = sentences.repeat(3);

    // A missing result collapses to an empty list, which fails the check below.
    let langs = detect_languages(&text, &config).unwrap().unwrap_or_default();
    assert!(!langs.is_empty());
}
529
+
530
/// Multi-language detection on an Arabic sentence paired with a Persian
/// sentence (0.4 confidence threshold) must return a non-empty list.
#[test]
fn test_arabic_persian() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };
    let sentences = "اللغة العربية هي واحدة من أقدم اللغات في العالم. زبان فارسی زبانی زیبا و شاعرانه است. ";
    let text = sentences.repeat(5);

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
544
+
545
/// A one-word input may or may not be detected; if a result is produced it
/// must contain at least one language.
#[test]
fn test_very_short_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    match detect_languages("Hello", &config).unwrap() {
        Some(langs) => assert!(!langs.is_empty()),
        // No detection at all is acceptable for such a short input.
        None => {}
    }
}
559
+
560
/// A single English sentence in single-language mode must be detected as
/// exactly one language, `eng`.
#[test]
fn test_medium_length_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sentence = "Machine learning is a subset of artificial intelligence that enables computers to learn from data.";

    let detected = detect_languages(sentence, &config).unwrap();
    // Matches only a one-element list whose sole entry is "eng".
    assert!(matches!(detected.as_deref(), Some([only]) if only == "eng"));
}
575
+
576
/// A long English document (one paragraph repeated 20 times) with a 0.7
/// confidence threshold must be detected as exactly one language, `eng`.
#[test]
fn test_very_long_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.7,
        detect_multiple: false,
    };
    let paragraph = "The advancement of technology in the twenty-first century has transformed how we live, work, and communicate. \
                     From smartphones to artificial intelligence, these innovations have created unprecedented opportunities and challenges. \
                     Understanding the implications of technological progress requires careful consideration of ethical, social, and economic factors. ";
    let document = paragraph.repeat(20);

    // A missing result collapses to an empty list, which fails the length check.
    let langs = detect_languages(&document, &config).unwrap().unwrap_or_default();
    assert_eq!(langs.len(), 1);
    assert_eq!(langs[0], "eng");
}
594
+
595
/// Digits-only input must produce no detection result (`None`).
#[test]
fn test_numbers_only() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let digits = "123456789 0123456789 987654321";

    let outcome = detect_languages(digits, &config).unwrap();
    assert!(matches!(outcome, None));
}
607
+
608
/// Punctuation-only input must produce no detection result (`None`).
#[test]
fn test_punctuation_only() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let symbols = "!!! ??? ... --- *** @@@ ###";

    let outcome = detect_languages(symbols, &config).unwrap();
    assert!(matches!(outcome, None));
}
620
+
621
/// Whitespace-only input (spaces, tabs, newlines) must produce no detection
/// result (`None`).
#[test]
fn test_whitespace_only() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let blank = " \t\n \n\n\t\t ";

    let outcome = detect_languages(blank, &config).unwrap();
    assert!(matches!(outcome, None));
}
633
+
634
/// English prose sprinkled with numbers must still be detected, with `eng`
/// as the first reported language.
#[test]
fn test_mixed_numbers_and_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose = "The year 2024 marks the 100th anniversary of the founding. Over 50 countries participated in the event with more than 10,000 attendees.";

    let detected = detect_languages(prose, &config).unwrap();
    // Requires a result whose first entry is "eng".
    assert!(matches!(detected.as_deref(), Some([first, ..]) if first == "eng"));
}
648
+
649
/// English prose containing a URL and an e-mail address must still be
/// detected, with `eng` as the first reported language.
#[test]
fn test_text_with_urls() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose = "Visit our website at https://example.com for more information. You can also contact us at info@example.com or follow us on social media.";

    // A missing result collapses to an empty list, so `first()` yields `None`.
    let langs = detect_languages(prose, &config).unwrap().unwrap_or_default();
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
663
+
664
/// English prose containing several e-mail addresses must still be detected,
/// with `eng` as the first reported language.
#[test]
fn test_text_with_email_addresses() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose = "Please send your resume to jobs@company.com or contact.us@example.org for inquiries. Our support team at support@help.com is available 24/7.";

    let detected = detect_languages(prose, &config).unwrap();
    // Requires a result whose first entry is "eng".
    assert!(matches!(detected.as_deref(), Some([first, ..]) if first == "eng"));
}
678
+
679
// Feeds a Rust snippet with English line comments to single-language detection
// (0.4 confidence threshold). Detection is optional here: only a produced
// result is checked, and it must then be non-empty.
// NOTE(review): this diff rendering collapses leading whitespace, so the raw
// string's original indentation is not visible here.
+ #[test]
680
+ fn test_code_with_comments() {
681
+ let text = r#"
682
+ // This function calculates the factorial of a number
683
+ fn factorial(n: u64) -> u64 {
684
+ if n == 0 {
685
+ return 1;
686
+ }
687
+ n * factorial(n - 1)
688
+ }
689
+
690
+ // The algorithm uses recursion to compute the result efficiently
691
+ // It handles edge cases like zero and negative numbers appropriately
692
+ "#;
693
+ let config = LanguageDetectionConfig {
694
+ enabled: true,
695
+ min_confidence: 0.4,
696
+ detect_multiple: false,
697
+ };
698
+
699
+ let result = detect_languages(text, &config).unwrap();
// `None` is accepted; a `Some` result must list at least one language.
700
+ if let Some(langs) = result {
701
+ assert!(!langs.is_empty());
702
+ }
703
+ }
704
+
705
// Feeds a snippet that is almost entirely Rust code to single-language
// detection and accepts any outcome that reports at most one language.
// NOTE(review): this diff rendering collapses leading whitespace, so the raw
// string's original indentation is not visible here.
+ #[test]
706
+ fn test_predominantly_code() {
707
+ let text = r#"
708
+ let x = 42;
709
+ let y = x * 2;
710
+ println!("{}", y);
711
+ fn main() {
712
+ let vec = vec![1, 2, 3];
713
+ for i in vec {
714
+ println!("{}", i);
715
+ }
716
+ }
717
+ "#;
718
+ let config = LanguageDetectionConfig {
719
+ enabled: true,
720
+ min_confidence: 0.5,
721
+ detect_multiple: false,
722
+ };
723
+
724
+ let result = detect_languages(text, &config).unwrap();
// The three alternatives reduce to "no result, or a list of length <= 1":
// an empty list already satisfies the final `len() <= 1` clause.
725
+ assert!(result.is_none() || result.as_ref().unwrap().is_empty() || result.as_ref().unwrap().len() <= 1);
726
+ }
727
+
728
// Feeds English documentation prose with a short embedded code example to
// single-language detection; a result is required and its first entry must
// be "eng".
// NOTE(review): this diff rendering collapses leading whitespace, so the raw
// string's original indentation is not visible here.
+ #[test]
729
+ fn test_documentation_with_code() {
730
+ let text = r#"
731
+ Language detection is an important feature in document processing systems.
732
+ It allows applications to automatically identify the language of text content.
733
+ This is particularly useful for multilingual documents and international applications.
734
+
735
+ Example code:
736
+ let config = LanguageDetectionConfig::default();
737
+ let result = detect_languages(text, &config);
738
+
739
+ The detection algorithm analyzes character patterns and word frequencies to determine the most likely language.
740
+ Modern detection systems achieve high accuracy rates across dozens of languages.
741
+ "#;
742
+ let config = LanguageDetectionConfig {
743
+ enabled: true,
744
+ min_confidence: 0.5,
745
+ detect_multiple: false,
746
+ };
747
+
748
+ let result = detect_languages(text, &config).unwrap();
// Unlike the code-heavy tests, a missing result is a failure here.
749
+ assert!(result.is_some());
750
+ let langs = result.unwrap();
751
+ assert_eq!(langs[0], "eng");
752
+ }
753
+
754
/// English text dense with medical vocabulary must still be detected, with
/// `eng` as the first reported language.
#[test]
fn test_medical_terminology() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let report = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
                  The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
                  Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";

    // A missing result collapses to an empty list, so `first()` yields `None`.
    let langs = detect_languages(report, &config).unwrap().unwrap_or_default();
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
770
+
771
/// English text dense with legal vocabulary must still be detected, with
/// `eng` as the first reported language.
#[test]
fn test_legal_terminology() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let filing = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
                  Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
                  The court finds that the preponderance of evidence supports the plaintiff's claims.";

    let detected = detect_languages(filing, &config).unwrap();
    // Requires a result whose first entry is "eng".
    assert!(matches!(detected.as_deref(), Some([first, ..]) if first == "eng"));
}
787
+
788
/// English text dense with scientific vocabulary must still be detected,
/// with `eng` as the first reported language.
#[test]
fn test_scientific_terminology() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let paper = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
                 Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
                 The results demonstrated significant correlation between molecular structure and optical properties.";

    // A missing result collapses to an empty list, so `first()` yields `None`.
    let langs = detect_languages(paper, &config).unwrap().unwrap_or_default();
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
804
+
805
/// Multi-language detection on Latin-script English followed by Cyrillic
/// Russian must return a non-empty language list.
#[test]
fn test_latin_cyrillic_mix() {
    let latin = "Modern technology enables global communication across language barriers. ";
    let cyrillic = "Современные технологии позволяют общаться по всему миру. ";
    let text = latin.repeat(5) + &cyrillic.repeat(5);

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
823
+
824
/// Multi-language detection on Latin-script English followed by Chinese
/// (0.4 confidence threshold) must return a non-empty language list.
#[test]
fn test_latin_cjk_mix() {
    let latin = "Technology companies are expanding into Asian markets. ";
    let cjk = "科技公司正在进军亚洲市场。";

    let mut text = latin.repeat(5);
    text.push_str(&cjk.repeat(5));

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    // A missing result collapses to an empty list, which fails the check below.
    let langs = detect_languages(&text, &config).unwrap().unwrap_or_default();
    assert!(!langs.is_empty());
}
842
+
843
/// Multi-language detection on Latin-script English followed by Arabic
/// (0.4 confidence threshold) must return a non-empty language list.
#[test]
fn test_latin_arabic_mix() {
    let latin = "International cooperation is essential for global peace and prosperity. ";
    let arabic = "التعاون الدولي ضروري للسلام والازدهار العالمي. ";
    let text = latin.repeat(5) + &arabic.repeat(5);

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    let detected = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(matches!(detected.as_deref(), Some([_, ..])));
}
861
+
862
/// Runs single-language detection (0.3 confidence threshold) on isolated
/// words from four languages.
///
/// Detection is optional for each word; the only requirement is that a
/// produced result is never an empty list.
#[test]
fn test_single_word_detection() {
    // Fixed-size table: a plain array avoids the needless `vec!` allocation.
    // The second element records the language each word belongs to. It is kept
    // for readers but deliberately not asserted.
    // NOTE(review): presumably single words are too ambiguous to pin to one
    // language code — confirm before tightening this test.
    let words = [("hello", "eng"), ("bonjour", "fra"), ("hola", "spa"), ("привет", "rus")];

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.3,
        detect_multiple: false,
    };

    for (word, _expected_lang) in words {
        let result = detect_languages(word, &config).unwrap();
        if let Some(langs) = result {
            // Name the offending word so a failure in the loop is traceable.
            assert!(
                !langs.is_empty(),
                "Detection for {:?} returned an empty language list",
                word
            );
        }
    }
}
879
+
880
/// Highly repetitive input (one word repeated 500 times) may or may not be
/// detected; if a result is produced it must contain at least one language.
#[test]
fn test_repetitive_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let repeated = "test test test test test ".repeat(100);

    match detect_languages(&repeated, &config).unwrap() {
        Some(langs) => assert!(!langs.is_empty()),
        // No detection at all is acceptable for this input.
        None => {}
    }
}
894
+
895
/// Two detection runs over the same text with the same configuration must
/// produce identical results.
#[test]
fn test_detection_consistency() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sentence = "This is a consistent test of language detection capabilities across multiple runs.";

    // Run the detector twice, in order, and compare the two outcomes.
    let runs: Vec<_> = (0..2)
        .map(|_| detect_languages(sentence, &config).unwrap())
        .collect();

    assert_eq!(runs[0], runs[1], "Detection should be deterministic");
}
909
+
910
/// Runs multi-language detection on a 500-character single-letter input and
/// on a longer English input (a 22-character sentence repeated 30 times).
///
/// Each call must complete without an error (`unwrap()`), and any result
/// that is produced must list at least one language. The earlier
/// `is_none() || is_some()` assertions were tautologies that could never
/// fail, so they are replaced by this checkable invariant.
// NOTE(review): the test name suggests these lengths straddle an internal
// chunk size; the actual chunk size is not visible here — confirm.
#[test]
fn test_chunk_size_boundary() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let chunk_text = "a".repeat(500);
    if let Some(langs) = detect_languages(&chunk_text, &config).unwrap() {
        assert!(
            !langs.is_empty(),
            "If detection succeeds, should find at least one language"
        );
    }

    let over_chunk = "This is English text. ".repeat(30);
    if let Some(langs) = detect_languages(&over_chunk, &config).unwrap() {
        assert!(
            !langs.is_empty(),
            "If detection succeeds, should find at least one language"
        );
    }
}
926
+
927
/// English prose mixed with a percent sign, quotes, a hashtag and a mention
/// must still be detected, with `eng` as the first reported language.
#[test]
fn test_special_characters_with_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let post =
        "The company's revenue increased by 25% year-over-year. CEO said: \"We're excited!\" #growth @investors";

    // A missing result collapses to an empty list, so `first()` yields `None`.
    let langs = detect_languages(post, &config).unwrap().unwrap_or_default();
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
942
+ }