kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +105 -2
  8. data/README.md +454 -454
  9. data/Rakefile +33 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +1 -1
  60. data/vendor/kreuzberg/Cargo.toml +5 -5
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  310. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  311. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  312. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  313. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  315. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  316. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  317. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  318. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  319. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  320. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  321. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  322. data/vendor/kreuzberg-tesseract/README.md +399 -399
  323. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  324. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  325. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  326. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  327. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  328. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  329. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  330. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  331. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  332. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  333. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  334. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  335. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  336. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  337. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  338. data/vendor/rb-sys/Cargo.lock +393 -393
  339. data/vendor/rb-sys/Cargo.toml +70 -70
  340. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  341. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  342. data/vendor/rb-sys/LICENSE-MIT +21 -21
  343. data/vendor/rb-sys/build/features.rs +111 -111
  344. data/vendor/rb-sys/build/main.rs +286 -286
  345. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  346. data/vendor/rb-sys/build/version.rs +50 -50
  347. data/vendor/rb-sys/readme.md +36 -36
  348. data/vendor/rb-sys/src/bindings.rs +21 -21
  349. data/vendor/rb-sys/src/hidden.rs +11 -11
  350. data/vendor/rb-sys/src/lib.rs +35 -35
  351. data/vendor/rb-sys/src/macros.rs +371 -371
  352. data/vendor/rb-sys/src/memory.rs +53 -53
  353. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  354. data/vendor/rb-sys/src/special_consts.rs +31 -31
  355. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  356. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  357. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  358. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  359. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  360. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  361. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  362. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api.rs +260 -260
  364. data/vendor/rb-sys/src/symbol.rs +31 -31
  365. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  366. data/vendor/rb-sys/src/utils.rs +89 -89
  367. data/vendor/rb-sys/src/value_type.rs +7 -7
  368. metadata +73 -4
  369. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
@@ -1,985 +1,985 @@
1
- //! Language detection using whatlang library.
2
- //!
3
- //! Provides fast language detection for extracted text content.
4
-
5
- use crate::Result;
6
- use crate::core::config::LanguageDetectionConfig;
7
- use once_cell::sync::Lazy;
8
- use std::sync::Arc;
9
- use whatlang::{Lang, detect};
10
-
11
- pub mod processor;
12
- pub use processor::LanguageDetector;
13
-
14
- /// Detect languages in text using whatlang.
15
- ///
16
- /// Returns a list of detected language codes (ISO 639-3 format).
17
- /// Returns `None` if no languages could be detected with sufficient confidence.
18
- ///
19
- /// # Arguments
20
- ///
21
- /// * `text` - The text to analyze for language detection
22
- /// * `config` - Optional configuration for language detection
23
- ///
24
- /// # Example
25
- ///
26
- /// ```rust
27
- /// use kreuzberg::language_detection::detect_languages;
28
- /// use kreuzberg::core::config::LanguageDetectionConfig;
29
- ///
30
- /// let text = "Hello world! This is English text.";
31
- /// let config = LanguageDetectionConfig {
32
- /// enabled: true,
33
- /// min_confidence: 0.8,
34
- /// detect_multiple: false,
35
- /// };
36
- /// let languages = detect_languages(text, &config).expect("language detection succeeded");
37
- /// println!("Detected languages: {:?}", languages);
38
- /// ```
39
- pub fn detect_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
40
- if !config.enabled {
41
- return Ok(None);
42
- }
43
-
44
- if text.trim().is_empty() {
45
- return Ok(None);
46
- }
47
-
48
- if !config.detect_multiple {
49
- return detect_single_language(text, config);
50
- }
51
-
52
- detect_multiple_languages(text, config)
53
- }
54
-
55
- /// Detect a single primary language in the text.
56
- fn detect_single_language(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
57
- match detect(text) {
58
- Some(info) => {
59
- if info.confidence() >= config.min_confidence {
60
- let lang_code = lang_to_iso639_3(info.lang());
61
- Ok(Some(vec![lang_code]))
62
- } else {
63
- Ok(None)
64
- }
65
- }
66
- None => Ok(None),
67
- }
68
- }
69
-
70
- /// Detect multiple languages in the text by analyzing chunks.
71
- ///
72
- /// This splits the text into chunks and detects the language of each chunk,
73
- /// then returns the most common languages found.
74
- fn detect_multiple_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
75
- const CHUNK_SIZE: usize = 200;
76
- let char_vec: Vec<char> = text.chars().collect();
77
- let chunk_strings: Vec<String> = char_vec
78
- .chunks(CHUNK_SIZE)
79
- .map(|chunk| chunk.iter().collect::<String>())
80
- .collect();
81
-
82
- if chunk_strings.is_empty() {
83
- return Ok(None);
84
- }
85
-
86
- let mut lang_counts = std::collections::HashMap::new();
87
- let threshold = config.min_confidence.min(0.35);
88
-
89
- for chunk in &chunk_strings {
90
- if let Some(info) = detect(chunk)
91
- && info.confidence() >= threshold
92
- {
93
- *lang_counts.entry(info.lang()).or_insert(0) += 1;
94
- }
95
- }
96
-
97
- if lang_counts.is_empty() {
98
- return detect_single_language(text, config);
99
- }
100
-
101
- let mut lang_vec: Vec<(Lang, usize)> = lang_counts.into_iter().collect();
102
- lang_vec.sort_by(|a, b| b.1.cmp(&a.1));
103
-
104
- let languages: Vec<String> = lang_vec.iter().map(|(lang, _)| lang_to_iso639_3(*lang)).collect();
105
-
106
- Ok(Some(languages))
107
- }
108
-
109
- /// Convert whatlang Lang enum to ISO 639-3 language code.
110
- ///
111
- /// Maps whatlang's language codes to standardized ISO 639-3 codes.
112
- fn lang_to_iso639_3(lang: Lang) -> String {
113
- match lang {
114
- Lang::Eng => "eng",
115
- Lang::Rus => "rus",
116
- Lang::Cmn => "cmn",
117
- Lang::Spa => "spa",
118
- Lang::Por => "por",
119
- Lang::Ita => "ita",
120
- Lang::Fra => "fra",
121
- Lang::Deu => "deu",
122
- Lang::Ukr => "ukr",
123
- Lang::Kat => "kat",
124
- Lang::Ara => "ara",
125
- Lang::Hin => "hin",
126
- Lang::Jpn => "jpn",
127
- Lang::Heb => "heb",
128
- Lang::Yid => "yid",
129
- Lang::Pol => "pol",
130
- Lang::Amh => "amh",
131
- Lang::Jav => "jav",
132
- Lang::Kor => "kor",
133
- Lang::Nob => "nob",
134
- Lang::Dan => "dan",
135
- Lang::Swe => "swe",
136
- Lang::Fin => "fin",
137
- Lang::Tur => "tur",
138
- Lang::Nld => "nld",
139
- Lang::Hun => "hun",
140
- Lang::Ces => "ces",
141
- Lang::Ell => "ell",
142
- Lang::Bul => "bul",
143
- Lang::Bel => "bel",
144
- Lang::Mar => "mar",
145
- Lang::Kan => "kan",
146
- Lang::Ron => "ron",
147
- Lang::Slv => "slv",
148
- Lang::Hrv => "hrv",
149
- Lang::Srp => "srp",
150
- Lang::Mkd => "mkd",
151
- Lang::Lit => "lit",
152
- Lang::Lav => "lav",
153
- Lang::Est => "est",
154
- Lang::Tam => "tam",
155
- Lang::Vie => "vie",
156
- Lang::Urd => "urd",
157
- Lang::Tha => "tha",
158
- Lang::Guj => "guj",
159
- Lang::Uzb => "uzb",
160
- Lang::Pan => "pan",
161
- Lang::Aze => "aze",
162
- Lang::Ind => "ind",
163
- Lang::Tel => "tel",
164
- Lang::Pes => "pes",
165
- Lang::Mal => "mal",
166
- Lang::Ori => "ori",
167
- Lang::Mya => "mya",
168
- Lang::Nep => "nep",
169
- Lang::Sin => "sin",
170
- Lang::Khm => "khm",
171
- Lang::Tuk => "tuk",
172
- Lang::Aka => "aka",
173
- Lang::Zul => "zul",
174
- Lang::Sna => "sna",
175
- Lang::Afr => "afr",
176
- Lang::Lat => "lat",
177
- Lang::Slk => "slk",
178
- Lang::Cat => "cat",
179
- Lang::Tgl => "tgl",
180
- Lang::Hye => "hye",
181
- Lang::Epo => "epo",
182
- Lang::Ben => "ben",
183
- Lang::Cym => "cym",
184
- }
185
- .to_string()
186
- }
187
-
188
- #[cfg(test)]
189
- mod tests {
190
- use super::*;
191
-
192
- #[test]
193
- fn test_detect_single_language_english() {
194
- let text = "Hello world! This is a test of the language detection system.";
195
- let config = LanguageDetectionConfig {
196
- enabled: true,
197
- min_confidence: 0.8,
198
- detect_multiple: false,
199
- };
200
-
201
- let result = detect_languages(text, &config).unwrap();
202
- assert!(result.is_some());
203
- let langs = result.unwrap();
204
- assert_eq!(langs.len(), 1);
205
- assert_eq!(langs[0], "eng");
206
- }
207
-
208
- #[test]
209
- fn test_detect_single_language_spanish() {
210
- let text = "Hola mundo! Esta es una prueba del sistema de detección de idiomas.";
211
- let config = LanguageDetectionConfig {
212
- enabled: true,
213
- min_confidence: 0.8,
214
- detect_multiple: false,
215
- };
216
-
217
- let result = detect_languages(text, &config).unwrap();
218
- assert!(result.is_some());
219
- let langs = result.unwrap();
220
- assert_eq!(langs.len(), 1);
221
- assert_eq!(langs[0], "spa");
222
- }
223
-
224
- #[test]
225
- fn test_detect_multiple_languages() {
226
- let text = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. \
227
- Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. \
228
- Bonjour le monde! Ceci est un texte en français. Le renard brun rapide saute par-dessus le chien paresseux.";
229
- let config = LanguageDetectionConfig {
230
- enabled: true,
231
- min_confidence: 0.3,
232
- detect_multiple: true,
233
- };
234
-
235
- let result = detect_languages(text, &config).unwrap();
236
- if let Some(langs) = result {
237
- assert!(
238
- !langs.is_empty(),
239
- "If detection succeeds, should return at least one language"
240
- );
241
- }
242
- }
243
-
244
- #[test]
245
- fn test_detect_disabled() {
246
- let text = "Hello world!";
247
- let config = LanguageDetectionConfig {
248
- enabled: false,
249
- min_confidence: 0.8,
250
- detect_multiple: false,
251
- };
252
-
253
- let result = detect_languages(text, &config).unwrap();
254
- assert!(result.is_none());
255
- }
256
-
257
- #[test]
258
- fn test_detect_empty_text() {
259
- let text = "";
260
- let config = LanguageDetectionConfig {
261
- enabled: true,
262
- min_confidence: 0.8,
263
- detect_multiple: false,
264
- };
265
-
266
- let result = detect_languages(text, &config).unwrap();
267
- assert!(result.is_none());
268
- }
269
-
270
- #[test]
271
- fn test_lang_to_iso639_3() {
272
- assert_eq!(lang_to_iso639_3(Lang::Eng), "eng");
273
- assert_eq!(lang_to_iso639_3(Lang::Spa), "spa");
274
- assert_eq!(lang_to_iso639_3(Lang::Fra), "fra");
275
- assert_eq!(lang_to_iso639_3(Lang::Deu), "deu");
276
- assert_eq!(lang_to_iso639_3(Lang::Cmn), "cmn");
277
- }
278
-
279
- #[test]
280
- fn test_confidence_threshold_filters_low_confidence() {
281
- let text = "ok yes no";
282
- let high_confidence_config = LanguageDetectionConfig {
283
- enabled: true,
284
- min_confidence: 0.99,
285
- detect_multiple: false,
286
- };
287
-
288
- let result = detect_languages(text, &high_confidence_config).unwrap();
289
- assert!(result.is_none());
290
- }
291
-
292
- #[test]
293
- fn test_confidence_threshold_accepts_high_confidence() {
294
- let text = "The quick brown fox jumps over the lazy dog. This is definitely English text with clear patterns.";
295
- let low_confidence_config = LanguageDetectionConfig {
296
- enabled: true,
297
- min_confidence: 0.5,
298
- detect_multiple: false,
299
- };
300
-
301
- let result = detect_languages(text, &low_confidence_config).unwrap();
302
- assert!(result.is_some());
303
- let langs = result.unwrap();
304
- assert_eq!(langs.len(), 1);
305
- assert_eq!(langs[0], "eng");
306
- }
307
-
308
- #[test]
309
- fn test_confidence_threshold_boundary_low() {
310
- let text =
311
- "This is a comprehensive English sentence with multiple words to ensure accurate language detection.";
312
- let very_low_threshold = LanguageDetectionConfig {
313
- enabled: true,
314
- min_confidence: 0.01,
315
- detect_multiple: false,
316
- };
317
-
318
- let result = detect_languages(text, &very_low_threshold).unwrap();
319
- assert!(result.is_some());
320
- let langs = result.unwrap();
321
- assert_eq!(langs.len(), 1);
322
- assert_eq!(langs[0], "eng");
323
- }
324
-
325
- #[test]
326
- fn test_confidence_threshold_boundary_high() {
327
- let text = "The quick brown fox jumps over the lazy dog.";
328
- let max_threshold = LanguageDetectionConfig {
329
- enabled: true,
330
- min_confidence: 1.0,
331
- detect_multiple: false,
332
- };
333
-
334
- let result = detect_languages(text, &max_threshold).unwrap();
335
- if let Some(langs) = result {
336
- assert_eq!(langs.len(), 1);
337
- }
338
- }
339
-
340
- #[test]
341
- fn test_confidence_threshold_multiple_languages() {
342
- let text = format!(
343
- "{}{}",
344
- "Hello world! This is English text. The quick brown fox jumps over the lazy dog. ".repeat(10),
345
- "Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. ".repeat(10)
346
- );
347
- let high_confidence_config = LanguageDetectionConfig {
348
- enabled: true,
349
- min_confidence: 0.5,
350
- detect_multiple: true,
351
- };
352
-
353
- let result = detect_languages(&text, &high_confidence_config).unwrap();
354
- if let Some(langs) = result {
355
- assert!(
356
- !langs.is_empty(),
357
- "If detection succeeds, should find at least one language"
358
- );
359
- let has_expected = langs.contains(&"eng".to_string())
360
- || langs.contains(&"spa".to_string())
361
- || langs.contains(&"fra".to_string());
362
- assert!(has_expected, "Should detect at least one of the languages in the text");
363
- }
364
- }
365
-
366
- #[test]
367
- fn test_confidence_threshold_filters_all_chunks() {
368
- let text = "a b c d e f g h i j k ".repeat(50);
369
- let high_confidence_config = LanguageDetectionConfig {
370
- enabled: true,
371
- min_confidence: 0.95,
372
- detect_multiple: true,
373
- };
374
-
375
- let result = detect_languages(&text, &high_confidence_config).unwrap();
376
- assert!(result.is_none() || result.unwrap().is_empty());
377
- }
378
-
379
- #[test]
380
- fn test_default_confidence_threshold() {
381
- let text = "This is a clear English sentence. The quick brown fox jumps over the lazy dog. \
382
- English text is easy to detect when there is sufficient content to analyze. \
383
- Language detection works best with longer text passages that provide more context.";
384
- let config = LanguageDetectionConfig {
385
- enabled: true,
386
- min_confidence: 0.5,
387
- detect_multiple: false,
388
- };
389
-
390
- let result = detect_languages(text, &config).unwrap();
391
- if let Some(langs) = result {
392
- assert_eq!(langs.len(), 1, "Single language mode should return one language");
393
- assert_eq!(langs[0], "eng", "Should detect English");
394
- }
395
- }
396
-
397
- #[test]
398
- fn test_english_spanish_document() {
399
- let text = format!(
400
- "{}{}",
401
- "The global economy has been experiencing significant changes in recent years. International cooperation is essential for addressing climate change and sustainable development. ".repeat(5),
402
- "La economía global ha estado experimentando cambios significativos en los últimos años. La cooperación internacional es esencial para abordar el cambio climático y el desarrollo sostenible. ".repeat(5)
403
- );
404
- let config = LanguageDetectionConfig {
405
- enabled: true,
406
- min_confidence: 0.5,
407
- detect_multiple: true,
408
- };
409
-
410
- let result = detect_languages(&text, &config).unwrap();
411
- assert!(result.is_some());
412
- let langs = result.unwrap();
413
- assert!(!langs.is_empty());
414
- assert!(langs.contains(&"eng".to_string()) || langs.contains(&"spa".to_string()));
415
- }
416
-
417
- #[test]
418
- fn test_chinese_english_document() {
419
- let text = format!(
420
- "{}{}",
421
- "中国是世界上人口最多的国家。中文是世界上使用人数最多的语言之一。中华文明有着五千年的悠久历史。".repeat(5),
422
- "China is the most populous country in the world. Chinese is one of the most widely spoken languages. Chinese civilization has a long history of five thousand years. ".repeat(5)
423
- );
424
- let config = LanguageDetectionConfig {
425
- enabled: true,
426
- min_confidence: 0.4,
427
- detect_multiple: true,
428
- };
429
-
430
- let result = detect_languages(&text, &config).unwrap();
431
- assert!(result.is_some());
432
- let langs = result.unwrap();
433
- assert!(!langs.is_empty());
434
- assert!(langs.contains(&"cmn".to_string()) || langs.contains(&"eng".to_string()));
435
- }
436
-
437
- #[test]
438
- fn test_french_german_document() {
439
- let text = format!(
440
- "{}{}",
441
- "La France est connue pour sa culture riche et sa cuisine délicieuse. Paris est la capitale de la France et une destination touristique populaire. ".repeat(5),
442
- "Deutschland ist bekannt für seine Ingenieurskunst und seine reiche Geschichte. Berlin ist die Hauptstadt Deutschlands und eine lebendige Metropole. ".repeat(5)
443
- );
444
- let config = LanguageDetectionConfig {
445
- enabled: true,
446
- min_confidence: 0.5,
447
- detect_multiple: true,
448
- };
449
-
450
- let result = detect_languages(&text, &config).unwrap();
451
- assert!(result.is_some());
452
- let langs = result.unwrap();
453
- assert!(!langs.is_empty());
454
- }
455
-
456
- #[test]
457
- fn test_russian_ukrainian_document() {
458
- let text = format!(
459
- "{}{}",
460
- "Россия является крупнейшей страной в мире по территории. Москва - столица России и крупнейший город страны. ".repeat(5),
461
- "Україна є країною в Східній Європі. Київ - столиця України та найбільше місто країни. ".repeat(5)
462
- );
463
- let config = LanguageDetectionConfig {
464
- enabled: true,
465
- min_confidence: 0.5,
466
- detect_multiple: true,
467
- };
468
-
469
- let result = detect_languages(&text, &config).unwrap();
470
- assert!(result.is_some());
471
- let langs = result.unwrap();
472
- assert!(!langs.is_empty());
473
- }
474
-
475
- #[test]
476
- fn test_romance_languages() {
477
- let text = "L'Italia è famosa per la sua arte e architettura. O português é falado em vários países. El español es uno de los idiomas más hablados del mundo. ".repeat(3);
478
- let config = LanguageDetectionConfig {
479
- enabled: true,
480
- min_confidence: 0.5,
481
- detect_multiple: true,
482
- };
483
-
484
- let result = detect_languages(&text, &config).unwrap();
485
- assert!(result.is_some());
486
- let langs = result.unwrap();
487
- assert!(!langs.is_empty());
488
- }
489
-
490
- #[test]
491
- fn test_germanic_languages() {
492
- let text = "Deutschland hat eine reiche Kulturgeschichte. Nederland is bekend om zijn tulpen en windmolens. Sverige är känt för sina skogar och innovationer. ".repeat(3);
493
- let config = LanguageDetectionConfig {
494
- enabled: true,
495
- min_confidence: 0.5,
496
- detect_multiple: true,
497
- };
498
-
499
- let result = detect_languages(&text, &config).unwrap();
500
- assert!(result.is_some());
501
- let langs = result.unwrap();
502
- assert!(!langs.is_empty());
503
- }
504
-
505
- #[test]
506
- fn test_slavic_languages() {
507
- let text = "Polska jest krajem w Europie Środkowej. Česká republika má bohatou historii. България е страна на Балканския полуостров. ".repeat(3);
508
- let config = LanguageDetectionConfig {
509
- enabled: true,
510
- min_confidence: 0.5,
511
- detect_multiple: true,
512
- };
513
-
514
- let result = detect_languages(&text, &config).unwrap();
515
- assert!(result.is_some());
516
- let langs = result.unwrap();
517
- assert!(!langs.is_empty());
518
- }
519
-
520
- #[test]
521
- fn test_cjk_languages() {
522
- let text = "中国是一个历史悠久的国家。日本は美しい桜の国です。한국은 아시아의 선진국입니다。".repeat(3);
523
- let config = LanguageDetectionConfig {
524
- enabled: true,
525
- min_confidence: 0.4,
526
- detect_multiple: true,
527
- };
528
-
529
- let result = detect_languages(&text, &config).unwrap();
530
- assert!(result.is_some());
531
- let langs = result.unwrap();
532
- assert!(!langs.is_empty());
533
- }
534
-
535
- #[test]
536
- fn test_arabic_persian() {
537
- let text = "اللغة العربية هي واحدة من أقدم اللغات في العالم. زبان فارسی زبانی زیبا و شاعرانه است. ".repeat(5);
538
- let config = LanguageDetectionConfig {
539
- enabled: true,
540
- min_confidence: 0.4,
541
- detect_multiple: true,
542
- };
543
-
544
- let result = detect_languages(&text, &config).unwrap();
545
- assert!(result.is_some());
546
- let langs = result.unwrap();
547
- assert!(!langs.is_empty());
548
- }
549
-
550
- #[test]
551
- fn test_very_short_text() {
552
- let text = "Hello";
553
- let config = LanguageDetectionConfig {
554
- enabled: true,
555
- min_confidence: 0.5,
556
- detect_multiple: false,
557
- };
558
-
559
- let result = detect_languages(text, &config).unwrap();
560
- if let Some(langs) = result {
561
- assert!(!langs.is_empty());
562
- }
563
- }
564
-
565
- #[test]
566
- fn test_medium_length_text() {
567
- let text = "Machine learning is a subset of artificial intelligence that enables computers to learn from data.";
568
- let config = LanguageDetectionConfig {
569
- enabled: true,
570
- min_confidence: 0.5,
571
- detect_multiple: false,
572
- };
573
-
574
- let result = detect_languages(text, &config).unwrap();
575
- assert!(result.is_some());
576
- let langs = result.unwrap();
577
- assert_eq!(langs.len(), 1);
578
- assert_eq!(langs[0], "eng");
579
- }
580
-
581
- #[test]
582
- fn test_very_long_text() {
583
- let paragraph = "The advancement of technology in the twenty-first century has transformed how we live, work, and communicate. \
584
- From smartphones to artificial intelligence, these innovations have created unprecedented opportunities and challenges. \
585
- Understanding the implications of technological progress requires careful consideration of ethical, social, and economic factors. ";
586
- let text = paragraph.repeat(20);
587
- let config = LanguageDetectionConfig {
588
- enabled: true,
589
- min_confidence: 0.7,
590
- detect_multiple: false,
591
- };
592
-
593
- let result = detect_languages(&text, &config).unwrap();
594
- assert!(result.is_some());
595
- let langs = result.unwrap();
596
- assert_eq!(langs.len(), 1);
597
- assert_eq!(langs[0], "eng");
598
- }
599
-
600
- #[test]
601
- fn test_numbers_only() {
602
- let text = "123456789 0123456789 987654321";
603
- let config = LanguageDetectionConfig {
604
- enabled: true,
605
- min_confidence: 0.5,
606
- detect_multiple: false,
607
- };
608
-
609
- let result = detect_languages(text, &config).unwrap();
610
- assert!(result.is_none());
611
- }
612
-
613
- #[test]
614
- fn test_punctuation_only() {
615
- let text = "!!! ??? ... --- *** @@@ ###";
616
- let config = LanguageDetectionConfig {
617
- enabled: true,
618
- min_confidence: 0.5,
619
- detect_multiple: false,
620
- };
621
-
622
- let result = detect_languages(text, &config).unwrap();
623
- assert!(result.is_none());
624
- }
625
-
626
- #[test]
627
- fn test_whitespace_only() {
628
- let text = " \t\n \n\n\t\t ";
629
- let config = LanguageDetectionConfig {
630
- enabled: true,
631
- min_confidence: 0.5,
632
- detect_multiple: false,
633
- };
634
-
635
- let result = detect_languages(text, &config).unwrap();
636
- assert!(result.is_none());
637
- }
638
-
639
- #[test]
640
- fn test_mixed_numbers_and_text() {
641
- let text = "The year 2024 marks the 100th anniversary of the founding. Over 50 countries participated in the event with more than 10,000 attendees.";
642
- let config = LanguageDetectionConfig {
643
- enabled: true,
644
- min_confidence: 0.5,
645
- detect_multiple: false,
646
- };
647
-
648
- let result = detect_languages(text, &config).unwrap();
649
- assert!(result.is_some());
650
- let langs = result.unwrap();
651
- assert_eq!(langs[0], "eng");
652
- }
653
-
654
- #[test]
655
- fn test_text_with_urls() {
656
- let text = "Visit our website at https://example.com for more information. You can also contact us at info@example.com or follow us on social media.";
657
- let config = LanguageDetectionConfig {
658
- enabled: true,
659
- min_confidence: 0.5,
660
- detect_multiple: false,
661
- };
662
-
663
- let result = detect_languages(text, &config).unwrap();
664
- assert!(result.is_some());
665
- let langs = result.unwrap();
666
- assert_eq!(langs[0], "eng");
667
- }
668
-
669
- #[test]
670
- fn test_text_with_email_addresses() {
671
- let text = "Please send your resume to jobs@company.com or contact.us@example.org for inquiries. Our support team at support@help.com is available 24/7.";
672
- let config = LanguageDetectionConfig {
673
- enabled: true,
674
- min_confidence: 0.5,
675
- detect_multiple: false,
676
- };
677
-
678
- let result = detect_languages(text, &config).unwrap();
679
- assert!(result.is_some());
680
- let langs = result.unwrap();
681
- assert_eq!(langs[0], "eng");
682
- }
683
-
684
- #[test]
685
- fn test_code_with_comments() {
686
- let text = r#"
687
- // This function calculates the factorial of a number
688
- fn factorial(n: u64) -> u64 {
689
- if n == 0 {
690
- return 1;
691
- }
692
- n * factorial(n - 1)
693
- }
694
-
695
- // The algorithm uses recursion to compute the result efficiently
696
- // It handles edge cases like zero and negative numbers appropriately
697
- "#;
698
- let config = LanguageDetectionConfig {
699
- enabled: true,
700
- min_confidence: 0.4,
701
- detect_multiple: false,
702
- };
703
-
704
- let result = detect_languages(text, &config).unwrap();
705
- if let Some(langs) = result {
706
- assert!(!langs.is_empty());
707
- }
708
- }
709
-
710
- #[test]
711
- fn test_predominantly_code() {
712
- let text = r#"
713
- let x = 42;
714
- let y = x * 2;
715
- println!("{}", y);
716
- fn main() {
717
- let vec = vec![1, 2, 3];
718
- for i in vec {
719
- println!("{}", i);
720
- }
721
- }
722
- "#;
723
- let config = LanguageDetectionConfig {
724
- enabled: true,
725
- min_confidence: 0.5,
726
- detect_multiple: false,
727
- };
728
-
729
- let result = detect_languages(text, &config).unwrap();
730
- assert!(result.is_none() || result.as_ref().unwrap().is_empty() || result.as_ref().unwrap().len() <= 1);
731
- }
732
-
733
- #[test]
734
- fn test_documentation_with_code() {
735
- let text = r#"
736
- Language detection is an important feature in document processing systems.
737
- It allows applications to automatically identify the language of text content.
738
- This is particularly useful for multilingual documents and international applications.
739
-
740
- Example code:
741
- let config = LanguageDetectionConfig::default();
742
- let result = detect_languages(text, &config);
743
-
744
- The detection algorithm analyzes character patterns and word frequencies to determine the most likely language.
745
- Modern detection systems achieve high accuracy rates across dozens of languages.
746
- "#;
747
- let config = LanguageDetectionConfig {
748
- enabled: true,
749
- min_confidence: 0.5,
750
- detect_multiple: false,
751
- };
752
-
753
- let result = detect_languages(text, &config).unwrap();
754
- assert!(result.is_some());
755
- let langs = result.unwrap();
756
- assert_eq!(langs[0], "eng");
757
- }
758
-
759
- #[test]
760
- fn test_medical_terminology() {
761
- let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
762
- The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
763
- Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
764
- let config = LanguageDetectionConfig {
765
- enabled: true,
766
- min_confidence: 0.5,
767
- detect_multiple: false,
768
- };
769
-
770
- let result = detect_languages(text, &config).unwrap();
771
- assert!(result.is_some());
772
- let langs = result.unwrap();
773
- assert_eq!(langs[0], "eng");
774
- }
775
-
776
- #[test]
777
- fn test_legal_terminology() {
778
- let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
779
- Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
780
- The court finds that the preponderance of evidence supports the plaintiff's claims.";
781
- let config = LanguageDetectionConfig {
782
- enabled: true,
783
- min_confidence: 0.5,
784
- detect_multiple: false,
785
- };
786
-
787
- let result = detect_languages(text, &config).unwrap();
788
- assert!(result.is_some());
789
- let langs = result.unwrap();
790
- assert_eq!(langs[0], "eng");
791
- }
792
-
793
- #[test]
794
- fn test_scientific_terminology() {
795
- let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
796
- Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
797
- The results demonstrated significant correlation between molecular structure and optical properties.";
798
- let config = LanguageDetectionConfig {
799
- enabled: true,
800
- min_confidence: 0.5,
801
- detect_multiple: false,
802
- };
803
-
804
- let result = detect_languages(text, &config).unwrap();
805
- assert!(result.is_some());
806
- let langs = result.unwrap();
807
- assert_eq!(langs[0], "eng");
808
- }
809
-
810
- #[test]
811
- fn test_latin_cyrillic_mix() {
812
- let text = format!(
813
- "{}{}",
814
- "Modern technology enables global communication across language barriers. ".repeat(5),
815
- "Современные технологии позволяют общаться по всему миру. ".repeat(5)
816
- );
817
- let config = LanguageDetectionConfig {
818
- enabled: true,
819
- min_confidence: 0.5,
820
- detect_multiple: true,
821
- };
822
-
823
- let result = detect_languages(&text, &config).unwrap();
824
- assert!(result.is_some());
825
- let langs = result.unwrap();
826
- assert!(!langs.is_empty());
827
- }
828
-
829
- #[test]
830
- fn test_latin_cjk_mix() {
831
- let text = format!(
832
- "{}{}",
833
- "Technology companies are expanding into Asian markets. ".repeat(5),
834
- "科技公司正在进军亚洲市场。".repeat(5)
835
- );
836
- let config = LanguageDetectionConfig {
837
- enabled: true,
838
- min_confidence: 0.4,
839
- detect_multiple: true,
840
- };
841
-
842
- let result = detect_languages(&text, &config).unwrap();
843
- assert!(result.is_some());
844
- let langs = result.unwrap();
845
- assert!(!langs.is_empty());
846
- }
847
-
848
- #[test]
849
- fn test_latin_arabic_mix() {
850
- let text = format!(
851
- "{}{}",
852
- "International cooperation is essential for global peace and prosperity. ".repeat(5),
853
- "التعاون الدولي ضروري للسلام والازدهار العالمي. ".repeat(5)
854
- );
855
- let config = LanguageDetectionConfig {
856
- enabled: true,
857
- min_confidence: 0.4,
858
- detect_multiple: true,
859
- };
860
-
861
- let result = detect_languages(&text, &config).unwrap();
862
- assert!(result.is_some());
863
- let langs = result.unwrap();
864
- assert!(!langs.is_empty());
865
- }
866
-
867
- #[test]
868
- fn test_single_word_detection() {
869
- let words = vec![("hello", "eng"), ("bonjour", "fra"), ("hola", "spa"), ("привет", "rus")];
870
-
871
- let config = LanguageDetectionConfig {
872
- enabled: true,
873
- min_confidence: 0.3,
874
- detect_multiple: false,
875
- };
876
-
877
- for (word, _expected_lang) in words {
878
- let result = detect_languages(word, &config).unwrap();
879
- if let Some(langs) = result {
880
- assert!(!langs.is_empty());
881
- }
882
- }
883
- }
884
-
885
- #[test]
886
- fn test_repetitive_text() {
887
- let text = "test test test test test ".repeat(100);
888
- let config = LanguageDetectionConfig {
889
- enabled: true,
890
- min_confidence: 0.5,
891
- detect_multiple: false,
892
- };
893
-
894
- let result = detect_languages(&text, &config).unwrap();
895
- if let Some(langs) = result {
896
- assert!(!langs.is_empty());
897
- }
898
- }
899
-
900
- #[test]
901
- fn test_detection_consistency() {
902
- let text = "This is a consistent test of language detection capabilities across multiple runs.";
903
- let config = LanguageDetectionConfig {
904
- enabled: true,
905
- min_confidence: 0.5,
906
- detect_multiple: false,
907
- };
908
-
909
- let result1 = detect_languages(text, &config).unwrap();
910
- let result2 = detect_languages(text, &config).unwrap();
911
-
912
- assert_eq!(result1, result2, "Detection should be deterministic");
913
- }
914
-
915
- #[test]
916
- fn test_chunk_size_boundary() {
917
- let chunk_text = "a".repeat(500);
918
- let config = LanguageDetectionConfig {
919
- enabled: true,
920
- min_confidence: 0.5,
921
- detect_multiple: true,
922
- };
923
-
924
- let result = detect_languages(&chunk_text, &config).unwrap();
925
- assert!(result.is_none() || result.is_some());
926
-
927
- let over_chunk = "This is English text. ".repeat(30);
928
- let result2 = detect_languages(&over_chunk, &config).unwrap();
929
- assert!(result2.is_none() || result2.is_some());
930
- }
931
-
932
- #[test]
933
- fn test_special_characters_with_text() {
934
- let text =
935
- "The company's revenue increased by 25% year-over-year. CEO said: \"We're excited!\" #growth @investors";
936
- let config = LanguageDetectionConfig {
937
- enabled: true,
938
- min_confidence: 0.5,
939
- detect_multiple: false,
940
- };
941
-
942
- let result = detect_languages(text, &config).unwrap();
943
- assert!(result.is_some());
944
- let langs = result.unwrap();
945
- assert_eq!(langs[0], "eng");
946
- }
947
- }
948
-
949
- /// Lazy-initialized flag that ensures language detection processor is registered exactly once.
950
- ///
951
- /// This static is accessed on first use to automatically register the
952
- /// language detection processor with the plugin registry.
953
- static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
954
-
955
- /// Ensure the language detection processor is registered.
956
- ///
957
- /// This function is called automatically when needed.
958
- /// It's safe to call multiple times - registration only happens once.
959
- pub fn ensure_initialized() -> Result<()> {
960
- PROCESSOR_INITIALIZED
961
- .as_ref()
962
- .map(|_| ())
963
- .map_err(|e| crate::KreuzbergError::Plugin {
964
- message: format!("Failed to register language detection processor: {}", e),
965
- plugin_name: "language-detection".to_string(),
966
- })
967
- }
968
-
969
- /// Register the language detection processor with the global registry.
970
- ///
971
- /// This function should be called once at application startup to register
972
- /// the language detection post-processor.
973
- ///
974
- /// **Note:** This is called automatically on first use.
975
- /// Explicit calling is optional.
976
- pub fn register_language_detection_processor() -> Result<()> {
977
- let registry = crate::plugins::registry::get_post_processor_registry();
978
- let mut registry = registry
979
- .write()
980
- .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
981
-
982
- registry.register(Arc::new(LanguageDetector), 40)?;
983
-
984
- Ok(())
985
- }
1
+ //! Language detection using whatlang library.
2
+ //!
3
+ //! Provides fast language detection for extracted text content.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::LanguageDetectionConfig;
7
+ use once_cell::sync::Lazy;
8
+ use std::sync::Arc;
9
+ use whatlang::{Lang, detect};
10
+
11
+ pub mod processor;
12
+ pub use processor::LanguageDetector;
13
+
14
+ /// Detect languages in text using whatlang.
15
+ ///
16
+ /// Returns a list of detected language codes (ISO 639-3 format).
17
+ /// Returns `None` if no languages could be detected with sufficient confidence.
18
+ ///
19
+ /// # Arguments
20
+ ///
21
+ /// * `text` - The text to analyze for language detection
22
+ /// * `config` - Optional configuration for language detection
23
+ ///
24
+ /// # Example
25
+ ///
26
+ /// ```rust
27
+ /// use kreuzberg::language_detection::detect_languages;
28
+ /// use kreuzberg::core::config::LanguageDetectionConfig;
29
+ ///
30
+ /// let text = "Hello world! This is English text.";
31
+ /// let config = LanguageDetectionConfig {
32
+ /// enabled: true,
33
+ /// min_confidence: 0.8,
34
+ /// detect_multiple: false,
35
+ /// };
36
+ /// let languages = detect_languages(text, &config).expect("language detection succeeded");
37
+ /// println!("Detected languages: {:?}", languages);
38
+ /// ```
39
+ pub fn detect_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
40
+ if !config.enabled {
41
+ return Ok(None);
42
+ }
43
+
44
+ if text.trim().is_empty() {
45
+ return Ok(None);
46
+ }
47
+
48
+ if !config.detect_multiple {
49
+ return detect_single_language(text, config);
50
+ }
51
+
52
+ detect_multiple_languages(text, config)
53
+ }
54
+
55
+ /// Detect a single primary language in the text.
56
+ fn detect_single_language(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
57
+ match detect(text) {
58
+ Some(info) => {
59
+ if info.confidence() >= config.min_confidence {
60
+ let lang_code = lang_to_iso639_3(info.lang());
61
+ Ok(Some(vec![lang_code]))
62
+ } else {
63
+ Ok(None)
64
+ }
65
+ }
66
+ None => Ok(None),
67
+ }
68
+ }
69
+
70
+ /// Detect multiple languages in the text by analyzing chunks.
71
+ ///
72
+ /// This splits the text into chunks and detects the language of each chunk,
73
+ /// then returns the most common languages found.
74
+ fn detect_multiple_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
75
+ const CHUNK_SIZE: usize = 200;
76
+ let char_vec: Vec<char> = text.chars().collect();
77
+ let chunk_strings: Vec<String> = char_vec
78
+ .chunks(CHUNK_SIZE)
79
+ .map(|chunk| chunk.iter().collect::<String>())
80
+ .collect();
81
+
82
+ if chunk_strings.is_empty() {
83
+ return Ok(None);
84
+ }
85
+
86
+ let mut lang_counts = std::collections::HashMap::new();
87
+ let threshold = config.min_confidence.min(0.35);
88
+
89
+ for chunk in &chunk_strings {
90
+ if let Some(info) = detect(chunk)
91
+ && info.confidence() >= threshold
92
+ {
93
+ *lang_counts.entry(info.lang()).or_insert(0) += 1;
94
+ }
95
+ }
96
+
97
+ if lang_counts.is_empty() {
98
+ return detect_single_language(text, config);
99
+ }
100
+
101
+ let mut lang_vec: Vec<(Lang, usize)> = lang_counts.into_iter().collect();
102
+ lang_vec.sort_by(|a, b| b.1.cmp(&a.1));
103
+
104
+ let languages: Vec<String> = lang_vec.iter().map(|(lang, _)| lang_to_iso639_3(*lang)).collect();
105
+
106
+ Ok(Some(languages))
107
+ }
108
+
109
+ /// Convert whatlang Lang enum to ISO 639-3 language code.
110
+ ///
111
+ /// Maps whatlang's language codes to standardized ISO 639-3 codes.
112
+ fn lang_to_iso639_3(lang: Lang) -> String {
113
+ match lang {
114
+ Lang::Eng => "eng",
115
+ Lang::Rus => "rus",
116
+ Lang::Cmn => "cmn",
117
+ Lang::Spa => "spa",
118
+ Lang::Por => "por",
119
+ Lang::Ita => "ita",
120
+ Lang::Fra => "fra",
121
+ Lang::Deu => "deu",
122
+ Lang::Ukr => "ukr",
123
+ Lang::Kat => "kat",
124
+ Lang::Ara => "ara",
125
+ Lang::Hin => "hin",
126
+ Lang::Jpn => "jpn",
127
+ Lang::Heb => "heb",
128
+ Lang::Yid => "yid",
129
+ Lang::Pol => "pol",
130
+ Lang::Amh => "amh",
131
+ Lang::Jav => "jav",
132
+ Lang::Kor => "kor",
133
+ Lang::Nob => "nob",
134
+ Lang::Dan => "dan",
135
+ Lang::Swe => "swe",
136
+ Lang::Fin => "fin",
137
+ Lang::Tur => "tur",
138
+ Lang::Nld => "nld",
139
+ Lang::Hun => "hun",
140
+ Lang::Ces => "ces",
141
+ Lang::Ell => "ell",
142
+ Lang::Bul => "bul",
143
+ Lang::Bel => "bel",
144
+ Lang::Mar => "mar",
145
+ Lang::Kan => "kan",
146
+ Lang::Ron => "ron",
147
+ Lang::Slv => "slv",
148
+ Lang::Hrv => "hrv",
149
+ Lang::Srp => "srp",
150
+ Lang::Mkd => "mkd",
151
+ Lang::Lit => "lit",
152
+ Lang::Lav => "lav",
153
+ Lang::Est => "est",
154
+ Lang::Tam => "tam",
155
+ Lang::Vie => "vie",
156
+ Lang::Urd => "urd",
157
+ Lang::Tha => "tha",
158
+ Lang::Guj => "guj",
159
+ Lang::Uzb => "uzb",
160
+ Lang::Pan => "pan",
161
+ Lang::Aze => "aze",
162
+ Lang::Ind => "ind",
163
+ Lang::Tel => "tel",
164
+ Lang::Pes => "pes",
165
+ Lang::Mal => "mal",
166
+ Lang::Ori => "ori",
167
+ Lang::Mya => "mya",
168
+ Lang::Nep => "nep",
169
+ Lang::Sin => "sin",
170
+ Lang::Khm => "khm",
171
+ Lang::Tuk => "tuk",
172
+ Lang::Aka => "aka",
173
+ Lang::Zul => "zul",
174
+ Lang::Sna => "sna",
175
+ Lang::Afr => "afr",
176
+ Lang::Lat => "lat",
177
+ Lang::Slk => "slk",
178
+ Lang::Cat => "cat",
179
+ Lang::Tgl => "tgl",
180
+ Lang::Hye => "hye",
181
+ Lang::Epo => "epo",
182
+ Lang::Ben => "ben",
183
+ Lang::Cym => "cym",
184
+ }
185
+ .to_string()
186
+ }
187
+
188
+ #[cfg(test)]
189
+ mod tests {
190
+ use super::*;
191
+
192
+ #[test]
193
+ fn test_detect_single_language_english() {
194
+ let text = "Hello world! This is a test of the language detection system.";
195
+ let config = LanguageDetectionConfig {
196
+ enabled: true,
197
+ min_confidence: 0.8,
198
+ detect_multiple: false,
199
+ };
200
+
201
+ let result = detect_languages(text, &config).unwrap();
202
+ assert!(result.is_some());
203
+ let langs = result.unwrap();
204
+ assert_eq!(langs.len(), 1);
205
+ assert_eq!(langs[0], "eng");
206
+ }
207
+
208
+ #[test]
209
+ fn test_detect_single_language_spanish() {
210
+ let text = "Hola mundo! Esta es una prueba del sistema de detección de idiomas.";
211
+ let config = LanguageDetectionConfig {
212
+ enabled: true,
213
+ min_confidence: 0.8,
214
+ detect_multiple: false,
215
+ };
216
+
217
+ let result = detect_languages(text, &config).unwrap();
218
+ assert!(result.is_some());
219
+ let langs = result.unwrap();
220
+ assert_eq!(langs.len(), 1);
221
+ assert_eq!(langs[0], "spa");
222
+ }
223
+
224
+ #[test]
225
+ fn test_detect_multiple_languages() {
226
+ let text = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. \
227
+ Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. \
228
+ Bonjour le monde! Ceci est un texte en français. Le renard brun rapide saute par-dessus le chien paresseux.";
229
+ let config = LanguageDetectionConfig {
230
+ enabled: true,
231
+ min_confidence: 0.3,
232
+ detect_multiple: true,
233
+ };
234
+
235
+ let result = detect_languages(text, &config).unwrap();
236
+ if let Some(langs) = result {
237
+ assert!(
238
+ !langs.is_empty(),
239
+ "If detection succeeds, should return at least one language"
240
+ );
241
+ }
242
+ }
243
+
244
+ #[test]
245
+ fn test_detect_disabled() {
246
+ let text = "Hello world!";
247
+ let config = LanguageDetectionConfig {
248
+ enabled: false,
249
+ min_confidence: 0.8,
250
+ detect_multiple: false,
251
+ };
252
+
253
+ let result = detect_languages(text, &config).unwrap();
254
+ assert!(result.is_none());
255
+ }
256
+
257
+ #[test]
258
+ fn test_detect_empty_text() {
259
+ let text = "";
260
+ let config = LanguageDetectionConfig {
261
+ enabled: true,
262
+ min_confidence: 0.8,
263
+ detect_multiple: false,
264
+ };
265
+
266
+ let result = detect_languages(text, &config).unwrap();
267
+ assert!(result.is_none());
268
+ }
269
+
270
+ #[test]
271
+ fn test_lang_to_iso639_3() {
272
+ assert_eq!(lang_to_iso639_3(Lang::Eng), "eng");
273
+ assert_eq!(lang_to_iso639_3(Lang::Spa), "spa");
274
+ assert_eq!(lang_to_iso639_3(Lang::Fra), "fra");
275
+ assert_eq!(lang_to_iso639_3(Lang::Deu), "deu");
276
+ assert_eq!(lang_to_iso639_3(Lang::Cmn), "cmn");
277
+ }
278
+
279
/// The short input `"ok yes no"` is expected to yield `None` at a 0.99 threshold.
#[test]
fn test_confidence_threshold_filters_low_confidence() {
    let strict = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.99,
        enabled: true,
    };

    assert!(detect_languages("ok yes no", &strict).unwrap().is_none());
}
291
+
292
/// Clear English text at a 0.5 threshold must yield exactly `["eng"]`.
#[test]
fn test_confidence_threshold_accepts_high_confidence() {
    let sample = "The quick brown fox jumps over the lazy dog. This is definitely English text with clear patterns.";
    let relaxed = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &relaxed).unwrap();
    // Equivalent to: result is `Some`, holds one entry, and that entry is "eng".
    assert_eq!(outcome, Some(vec!["eng".to_string()]));
}
307
+
308
/// With a very low threshold (0.01), an English sentence must yield exactly `["eng"]`.
#[test]
fn test_confidence_threshold_boundary_low() {
    let sample =
        "This is a comprehensive English sentence with multiple words to ensure accurate language detection.";
    let permissive = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.01,
        enabled: true,
    };

    let expected = vec![String::from("eng")];
    assert_eq!(detect_languages(sample, &permissive).unwrap(), Some(expected));
}
324
+
325
/// At the maximum threshold (1.0) a `None` result is accepted; a `Some` result
/// must contain exactly one language.
#[test]
fn test_confidence_threshold_boundary_high() {
    let strictest = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 1.0,
        enabled: true,
    };

    let outcome = detect_languages("The quick brown fox jumps over the lazy dog.", &strictest).unwrap();
    if let Some(codes) = outcome {
        assert_eq!(codes.len(), 1);
    }
}
339
+
340
/// Multi-language mode on repeated English and Spanish text: if anything is
/// detected, the list must be non-empty and contain `eng`, `spa`, or `fra`.
#[test]
fn test_confidence_threshold_multiple_languages() {
    let mut text = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. ".repeat(10);
    text.push_str(
        &"Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. ".repeat(10),
    );
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    if let Some(codes) = detect_languages(&text, &config).unwrap() {
        assert!(
            !codes.is_empty(),
            "If detection succeeds, should find at least one language"
        );
        let has_expected = codes
            .iter()
            .any(|code| matches!(code.as_str(), "eng" | "spa" | "fra"));
        assert!(has_expected, "Should detect at least one of the languages in the text");
    }
}
365
+
366
/// Repeated single letters at a 0.95 threshold must yield either `None` or an empty list.
#[test]
fn test_confidence_threshold_filters_all_chunks() {
    let noise = "a b c d e f g h i j k ".repeat(50);
    let strict = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.95,
        enabled: true,
    };

    let outcome = detect_languages(&noise, &strict).unwrap();
    assert!(outcome.map_or(true, |codes| codes.is_empty()));
}
378
+
379
/// Single-language mode at a 0.5 threshold on a longer English passage: a `None`
/// result is accepted; a `Some` result must be exactly one entry equal to `"eng"`.
#[test]
fn test_default_confidence_threshold() {
    let passage = concat!(
        "This is a clear English sentence. The quick brown fox jumps over the lazy dog. ",
        "English text is easy to detect when there is sufficient content to analyze. ",
        "Language detection works best with longer text passages that provide more context.",
    );
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    if let Some(codes) = detect_languages(passage, &config).unwrap() {
        assert_eq!(codes.len(), 1, "Single language mode should return one language");
        assert_eq!(codes[0], "eng", "Should detect English");
    }
}
396
+
397
/// English followed by Spanish paragraphs: detection must succeed with a
/// non-empty list containing `eng` or `spa`.
#[test]
fn test_english_spanish_document() {
    let text = [
        "The global economy has been experiencing significant changes in recent years. International cooperation is essential for addressing climate change and sustainable development. ".repeat(5),
        "La economía global ha estado experimentando cambios significativos en los últimos años. La cooperación internacional es esencial para abordar el cambio climático y el desarrollo sostenible. ".repeat(5),
    ]
    .concat();
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(outcome.is_some());
    let codes = outcome.unwrap_or_default();
    assert!(!codes.is_empty());
    assert!(codes.iter().any(|code| matches!(code.as_str(), "eng" | "spa")));
}
416
+
417
/// Chinese followed by English paragraphs: detection must succeed with a
/// non-empty list containing `cmn` or `eng`.
#[test]
fn test_chinese_english_document() {
    let mut text = "中国是世界上人口最多的国家。中文是世界上使用人数最多的语言之一。中华文明有着五千年的悠久历史。".repeat(5);
    text.push_str(
        &"China is the most populous country in the world. Chinese is one of the most widely spoken languages. Chinese civilization has a long history of five thousand years. ".repeat(5),
    );
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.4,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(outcome.is_some());
    let codes = outcome.unwrap_or_default();
    assert!(!codes.is_empty());
    assert!(codes.iter().any(|code| matches!(code.as_str(), "cmn" | "eng")));
}
436
+
437
/// French followed by German paragraphs: detection must return a non-empty list.
#[test]
fn test_french_german_document() {
    let text = [
        "La France est connue pour sa culture riche et sa cuisine délicieuse. Paris est la capitale de la France et une destination touristique populaire. ".repeat(5),
        "Deutschland ist bekannt für seine Ingenieurskunst und seine reiche Geschichte. Berlin ist die Hauptstadt Deutschlands und eine lebendige Metropole. ".repeat(5),
    ]
    .concat();
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(outcome.map_or(false, |codes| !codes.is_empty()));
}
455
+
456
/// Russian followed by Ukrainian paragraphs: detection must return a non-empty list.
#[test]
fn test_russian_ukrainian_document() {
    let mut text = "Россия является крупнейшей страной в мире по территории. Москва - столица России и крупнейший город страны. ".repeat(5);
    text.push_str(&"Україна є країною в Східній Європі. Київ - столиця України та найбільше місто країни. ".repeat(5));
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(outcome.map_or(false, |codes| !codes.is_empty()));
}
474
+
475
/// Italian, Portuguese and Spanish sentences mixed together: detection must
/// return a non-empty list.
#[test]
fn test_romance_languages() {
    let sentence = "L'Italia è famosa per la sua arte e architettura. O português é falado em vários países. El español es uno de los idiomas más hablados del mundo. ";
    let text = sentence.repeat(3);
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(matches!(outcome.as_deref(), Some([_, ..])));
}
489
+
490
/// German, Dutch and Swedish sentences mixed together: detection must return a
/// non-empty list.
#[test]
fn test_germanic_languages() {
    let sentence = "Deutschland hat eine reiche Kulturgeschichte. Nederland is bekend om zijn tulpen en windmolens. Sverige är känt för sina skogar och innovationer. ";
    let text = sentence.repeat(3);
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(matches!(outcome.as_deref(), Some([_, ..])));
}
504
+
505
/// Polish, Czech and Bulgarian sentences mixed together: detection must return
/// a non-empty list.
#[test]
fn test_slavic_languages() {
    let sentence = "Polska jest krajem w Europie Środkowej. Česká republika má bohatou historii. България е страна на Балканския полуостров. ";
    let text = sentence.repeat(3);
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(matches!(outcome.as_deref(), Some([_, ..])));
}
519
+
520
/// Chinese, Japanese and Korean sentences mixed together (threshold 0.4):
/// detection must return a non-empty list.
#[test]
fn test_cjk_languages() {
    let sentence = "中国是一个历史悠久的国家。日本は美しい桜の国です。한국은 아시아의 선진국입니다。";
    let text = sentence.repeat(3);
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.4,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(matches!(outcome.as_deref(), Some([_, ..])));
}
534
+
535
/// Arabic and Persian sentences mixed together (threshold 0.4): detection must
/// return a non-empty list.
#[test]
fn test_arabic_persian() {
    let sentence = "اللغة العربية هي واحدة من أقدم اللغات في العالم. زبان فارسی زبانی زیبا و شاعرانه است. ";
    let text = sentence.repeat(5);
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.4,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    assert!(matches!(outcome.as_deref(), Some([_, ..])));
}
549
+
550
/// A single short word: a `None` result is accepted; a `Some` result must be non-empty.
#[test]
fn test_very_short_text() {
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages("Hello", &config).unwrap();
    assert!(outcome.map_or(true, |codes| !codes.is_empty()));
}
564
+
565
/// A single English sentence must yield exactly `["eng"]`.
#[test]
fn test_medium_length_text() {
    let sentence = "Machine learning is a subset of artificial intelligence that enables computers to learn from data.";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sentence, &config).unwrap();
    // Equivalent to: result is `Some`, holds one entry, and that entry is "eng".
    assert_eq!(outcome, Some(vec!["eng".to_string()]));
}
580
+
581
/// An English paragraph repeated 20 times (threshold 0.7) must yield exactly `["eng"]`.
#[test]
fn test_very_long_text() {
    let paragraph = concat!(
        "The advancement of technology in the twenty-first century has transformed how we live, work, and communicate. ",
        "From smartphones to artificial intelligence, these innovations have created unprecedented opportunities and challenges. ",
        "Understanding the implications of technological progress requires careful consideration of ethical, social, and economic factors. ",
    );
    let document = paragraph.repeat(20);
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.7,
        enabled: true,
    };

    let outcome = detect_languages(&document, &config).unwrap();
    // Equivalent to: result is `Some`, holds one entry, and that entry is "eng".
    assert_eq!(outcome, Some(vec!["eng".to_string()]));
}
599
+
600
/// Digit-only input is expected to produce `None`.
#[test]
fn test_numbers_only() {
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages("123456789 0123456789 987654321", &config).unwrap();
    assert!(outcome.is_none());
}
612
+
613
/// Punctuation-only input is expected to produce `None`.
#[test]
fn test_punctuation_only() {
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages("!!! ??? ... --- *** @@@ ###", &config).unwrap();
    assert!(outcome.is_none());
}
625
+
626
/// Whitespace-only input (spaces, tabs, newlines) is expected to produce `None`.
#[test]
fn test_whitespace_only() {
    let blank = " \t\n \n\n\t\t ";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    assert!(detect_languages(blank, &config).unwrap().is_none());
}
638
+
639
/// English prose interleaved with numbers: the first detected language must be `"eng"`.
#[test]
fn test_mixed_numbers_and_text() {
    let sample = "The year 2024 marks the 100th anniversary of the founding. Over 50 countries participated in the event with more than 10,000 attendees.";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
653
+
654
/// English prose containing a URL and an e-mail address: the first detected
/// language must be `"eng"`.
#[test]
fn test_text_with_urls() {
    let sample = "Visit our website at https://example.com for more information. You can also contact us at info@example.com or follow us on social media.";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
668
+
669
/// English prose containing several e-mail addresses: the first detected
/// language must be `"eng"`.
#[test]
fn test_text_with_email_addresses() {
    let sample = "Please send your resume to jobs@company.com or contact.us@example.org for inquiries. Our support team at support@help.com is available 24/7.";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
683
+
684
// Runs single-language detection (threshold 0.4) on a Rust snippet whose
// comments are English sentences.
// Lenient check: a `None` result is accepted; a `Some` result must be non-empty.
+ #[test]
685
+ fn test_code_with_comments() {
686
+ let text = r#"
687
+ // This function calculates the factorial of a number
688
+ fn factorial(n: u64) -> u64 {
689
+ if n == 0 {
690
+ return 1;
691
+ }
692
+ n * factorial(n - 1)
693
+ }
694
+
695
+ // The algorithm uses recursion to compute the result efficiently
696
+ // It handles edge cases like zero and negative numbers appropriately
697
+ "#;
698
+ let config = LanguageDetectionConfig {
699
+ enabled: true,
700
+ min_confidence: 0.4,
701
+ detect_multiple: false,
702
+ };
703
+
704
+ let result = detect_languages(text, &config).unwrap();
705
+ if let Some(langs) = result {
706
+ assert!(!langs.is_empty());
707
+ }
708
+ }
709
+
710
// Runs single-language detection (threshold 0.5) on a snippet made up of Rust
// statements with no prose.
// The assertion accepts `None`, an empty list, or a list of at most one entry;
// the `is_empty()` arm is already covered by `len() <= 1`.
+ #[test]
711
+ fn test_predominantly_code() {
712
+ let text = r#"
713
+ let x = 42;
714
+ let y = x * 2;
715
+ println!("{}", y);
716
+ fn main() {
717
+ let vec = vec![1, 2, 3];
718
+ for i in vec {
719
+ println!("{}", i);
720
+ }
721
+ }
722
+ "#;
723
+ let config = LanguageDetectionConfig {
724
+ enabled: true,
725
+ min_confidence: 0.5,
726
+ detect_multiple: false,
727
+ };
728
+
729
+ let result = detect_languages(text, &config).unwrap();
730
+ assert!(result.is_none() || result.as_ref().unwrap().is_empty() || result.as_ref().unwrap().len() <= 1);
731
+ }
732
+
733
// Runs single-language detection (threshold 0.5) on English documentation text
// that embeds two lines of example code.
// Detection must succeed and the first returned language must be "eng".
+ #[test]
734
+ fn test_documentation_with_code() {
735
+ let text = r#"
736
+ Language detection is an important feature in document processing systems.
737
+ It allows applications to automatically identify the language of text content.
738
+ This is particularly useful for multilingual documents and international applications.
739
+
740
+ Example code:
741
+ let config = LanguageDetectionConfig::default();
742
+ let result = detect_languages(text, &config);
743
+
744
+ The detection algorithm analyzes character patterns and word frequencies to determine the most likely language.
745
+ Modern detection systems achieve high accuracy rates across dozens of languages.
746
+ "#;
747
+ let config = LanguageDetectionConfig {
748
+ enabled: true,
749
+ min_confidence: 0.5,
750
+ detect_multiple: false,
751
+ };
752
+
753
+ let result = detect_languages(text, &config).unwrap();
754
+ assert!(result.is_some());
755
+ let langs = result.unwrap();
756
+ assert_eq!(langs[0], "eng");
757
+ }
758
+
759
/// English medical prose: the first detected language must be `"eng"`.
#[test]
fn test_medical_terminology() {
    let sample = concat!(
        "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. ",
        "The electrocardiogram showed significant ST-segment elevation in the anterior leads. ",
        "Cardiac biomarkers including troponin and creatine kinase were significantly elevated.",
    );
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
775
+
776
/// English legal prose: the first detected language must be `"eng"`.
#[test]
fn test_legal_terminology() {
    let sample = concat!(
        "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. ",
        "Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. ",
        "The court finds that the preponderance of evidence supports the plaintiff's claims.",
    );
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
792
+
793
/// English scientific prose: the first detected language must be `"eng"`.
#[test]
fn test_scientific_terminology() {
    let sample = concat!(
        "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. ",
        "Quantum mechanical calculations were performed using density functional theory at the B3LYP level. ",
        "The results demonstrated significant correlation between molecular structure and optical properties.",
    );
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
809
+
810
/// Latin-script English followed by Cyrillic-script Russian: detection must
/// return a non-empty list.
#[test]
fn test_latin_cyrillic_mix() {
    let text = [
        "Modern technology enables global communication across language barriers. ".repeat(5),
        "Современные технологии позволяют общаться по всему миру. ".repeat(5),
    ]
    .concat();
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(outcome.map_or(false, |codes| !codes.is_empty()));
}
828
+
829
/// Latin-script English followed by Chinese text (threshold 0.4): detection
/// must return a non-empty list.
#[test]
fn test_latin_cjk_mix() {
    let text = [
        "Technology companies are expanding into Asian markets. ".repeat(5),
        "科技公司正在进军亚洲市场。".repeat(5),
    ]
    .concat();
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.4,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(outcome.map_or(false, |codes| !codes.is_empty()));
}
847
+
848
/// Latin-script English followed by Arabic text (threshold 0.4): detection
/// must return a non-empty list.
#[test]
fn test_latin_arabic_mix() {
    let text = [
        "International cooperation is essential for global peace and prosperity. ".repeat(5),
        "التعاون الدولي ضروري للسلام والازدهار العالمي. ".repeat(5),
    ]
    .concat();
    let config = LanguageDetectionConfig {
        detect_multiple: true,
        min_confidence: 0.4,
        enabled: true,
    };

    let outcome = detect_languages(&text, &config).unwrap();
    // Passes only for `Some` holding at least one entry.
    assert!(outcome.map_or(false, |codes| !codes.is_empty()));
}
866
+
867
/// Runs single-language detection (threshold 0.3) on individual words.
///
/// For each word, detection must not error, and a `Some` result must be
/// non-empty. The expected language code is deliberately NOT asserted (the
/// original test ignored it as well); it is only reported in the failure
/// message to aid diagnosis.
#[test]
fn test_single_word_detection() {
    // Fixed-size table: a plain array avoids the needless heap allocation of `vec!`.
    let words = [("hello", "eng"), ("bonjour", "fra"), ("hola", "spa"), ("привет", "rus")];

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.3,
        detect_multiple: false,
    };

    for (word, expected_lang) in words {
        let result = detect_languages(word, &config).unwrap();
        if let Some(langs) = result {
            assert!(
                !langs.is_empty(),
                "detection for {:?} (nominally {}) returned an empty language list",
                word,
                expected_lang
            );
        }
    }
}
884
+
885
/// Highly repetitive input: a `None` result is accepted; a `Some` result must be non-empty.
#[test]
fn test_repetitive_text() {
    let repeated = "test test test test test ".repeat(100);
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(&repeated, &config).unwrap();
    assert!(outcome.map_or(true, |codes| !codes.is_empty()));
}
899
+
900
/// Two detection runs over the same text and config must return equal results.
#[test]
fn test_detection_consistency() {
    let sample = "This is a consistent test of language detection capabilities across multiple runs.";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let detect_once = || detect_languages(sample, &config).unwrap();
    let first_run = detect_once();
    let second_run = detect_once();

    assert_eq!(first_run, second_run, "Detection should be deterministic");
}
914
+
915
/// Smoke-tests multi-language detection with inputs of 500 and 660 characters.
///
/// NOTE(review): the test name suggests 500 is the detector's chunk size —
/// confirm against `detect_languages`.
///
/// The previous assertions (`is_none() || is_some()`) were always true and
/// verified nothing. The real requirement is that detection returns `Ok` for
/// both inputs, which is now stated explicitly; the detected languages
/// themselves remain unconstrained.
#[test]
fn test_chunk_size_boundary() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    // Exactly 500 characters of a single repeated letter.
    let chunk_text = "a".repeat(500);
    detect_languages(&chunk_text, &config).expect("detection must not error on a 500-character input");

    // 30 repetitions of a 22-character sentence: 660 characters.
    let over_chunk = "This is English text. ".repeat(30);
    detect_languages(&over_chunk, &config).expect("detection must not error on a 660-character input");
}
931
+
932
/// English prose with symbols, quotes, a hashtag and a mention: the first
/// detected language must be `"eng"`.
#[test]
fn test_special_characters_with_text() {
    let sample =
        "The company's revenue increased by 25% year-over-year. CEO said: \"We're excited!\" #growth @investors";
    let config = LanguageDetectionConfig {
        detect_multiple: false,
        min_confidence: 0.5,
        enabled: true,
    };

    let outcome = detect_languages(sample, &config).unwrap();
    // `None` or an empty list both fail here, as they did with unwrap + index.
    let primary = outcome.as_ref().and_then(|codes| codes.first()).map(String::as_str);
    assert_eq!(primary, Some("eng"));
}
947
+ }
948
+
949
+ /// Lazy-initialized flag that ensures language detection processor is registered exactly once.
950
+ ///
951
+ /// This static is accessed on first use to automatically register the
952
+ /// language detection processor with the plugin registry.
953
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
954
+
955
+ /// Ensure the language detection processor is registered.
956
+ ///
957
+ /// This function is called automatically when needed.
958
+ /// It's safe to call multiple times - registration only happens once.
959
+ pub fn ensure_initialized() -> Result<()> {
960
+ PROCESSOR_INITIALIZED
961
+ .as_ref()
962
+ .map(|_| ())
963
+ .map_err(|e| crate::KreuzbergError::Plugin {
964
+ message: format!("Failed to register language detection processor: {}", e),
965
+ plugin_name: "language-detection".to_string(),
966
+ })
967
+ }
968
+
969
+ /// Register the language detection processor with the global registry.
970
+ ///
971
+ /// This function should be called once at application startup to register
972
+ /// the language detection post-processor.
973
+ ///
974
+ /// **Note:** This is called automatically on first use.
975
+ /// Explicit calling is optional.
976
+ pub fn register_language_detection_processor() -> Result<()> {
977
+ let registry = crate::plugins::registry::get_post_processor_registry();
978
+ let mut registry = registry
979
+ .write()
980
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
981
+
982
+ registry.register(Arc::new(LanguageDetector), 40)?;
983
+
984
+ Ok(())
985
+ }