kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,942 @@
1
+ //! Language detection using whatlang library.
2
+ //!
3
+ //! Provides fast language detection for extracted text content.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::LanguageDetectionConfig;
7
+ use whatlang::{Lang, detect};
8
+
9
+ /// Detect languages in text using whatlang.
10
+ ///
11
+ /// Returns a list of detected language codes (ISO 639-3 format).
12
+ /// Returns `None` if no languages could be detected with sufficient confidence.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `text` - The text to analyze for language detection
17
+ /// * `config` - Optional configuration for language detection
18
+ ///
19
+ /// # Example
20
+ ///
21
+ /// ```rust
22
+ /// use kreuzberg::language_detection::detect_languages;
23
+ /// use kreuzberg::core::config::LanguageDetectionConfig;
24
+ ///
25
+ /// let text = "Hello world! This is English text.";
26
+ /// let config = LanguageDetectionConfig {
27
+ /// enabled: true,
28
+ /// min_confidence: 0.8,
29
+ /// detect_multiple: false,
30
+ /// };
31
+ /// let languages = detect_languages(text, &config).expect("language detection succeeded");
32
+ /// println!("Detected languages: {:?}", languages);
33
+ /// ```
34
+ pub fn detect_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
35
+ if !config.enabled {
36
+ return Ok(None);
37
+ }
38
+
39
+ if text.trim().is_empty() {
40
+ return Ok(None);
41
+ }
42
+
43
+ if !config.detect_multiple {
44
+ return detect_single_language(text, config);
45
+ }
46
+
47
+ detect_multiple_languages(text, config)
48
+ }
49
+
50
+ /// Detect a single primary language in the text.
51
+ fn detect_single_language(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
52
+ match detect(text) {
53
+ Some(info) => {
54
+ if info.confidence() >= config.min_confidence {
55
+ let lang_code = lang_to_iso639_3(info.lang());
56
+ Ok(Some(vec![lang_code]))
57
+ } else {
58
+ Ok(None)
59
+ }
60
+ }
61
+ None => Ok(None),
62
+ }
63
+ }
64
+
65
+ /// Detect multiple languages in the text by analyzing chunks.
66
+ ///
67
+ /// This splits the text into chunks and detects the language of each chunk,
68
+ /// then returns the most common languages found.
69
+ fn detect_multiple_languages(text: &str, config: &LanguageDetectionConfig) -> Result<Option<Vec<String>>> {
70
+ const CHUNK_SIZE: usize = 200;
71
+ let char_vec: Vec<char> = text.chars().collect();
72
+ let chunk_strings: Vec<String> = char_vec
73
+ .chunks(CHUNK_SIZE)
74
+ .map(|chunk| chunk.iter().collect::<String>())
75
+ .collect();
76
+
77
+ if chunk_strings.is_empty() {
78
+ return Ok(None);
79
+ }
80
+
81
+ let mut lang_counts = std::collections::HashMap::new();
82
+ let threshold = config.min_confidence.min(0.35);
83
+
84
+ for chunk in &chunk_strings {
85
+ if let Some(info) = detect(chunk)
86
+ && info.confidence() >= threshold
87
+ {
88
+ *lang_counts.entry(info.lang()).or_insert(0) += 1;
89
+ }
90
+ }
91
+
92
+ if lang_counts.is_empty() {
93
+ return detect_single_language(text, config);
94
+ }
95
+
96
+ let mut lang_vec: Vec<(Lang, usize)> = lang_counts.into_iter().collect();
97
+ lang_vec.sort_by(|a, b| b.1.cmp(&a.1));
98
+
99
+ let languages: Vec<String> = lang_vec.iter().map(|(lang, _)| lang_to_iso639_3(*lang)).collect();
100
+
101
+ Ok(Some(languages))
102
+ }
103
+
104
+ /// Convert whatlang Lang enum to ISO 639-3 language code.
105
+ ///
106
+ /// Maps whatlang's language codes to standardized ISO 639-3 codes.
107
+ fn lang_to_iso639_3(lang: Lang) -> String {
108
+ match lang {
109
+ Lang::Eng => "eng",
110
+ Lang::Rus => "rus",
111
+ Lang::Cmn => "cmn",
112
+ Lang::Spa => "spa",
113
+ Lang::Por => "por",
114
+ Lang::Ita => "ita",
115
+ Lang::Fra => "fra",
116
+ Lang::Deu => "deu",
117
+ Lang::Ukr => "ukr",
118
+ Lang::Kat => "kat",
119
+ Lang::Ara => "ara",
120
+ Lang::Hin => "hin",
121
+ Lang::Jpn => "jpn",
122
+ Lang::Heb => "heb",
123
+ Lang::Yid => "yid",
124
+ Lang::Pol => "pol",
125
+ Lang::Amh => "amh",
126
+ Lang::Jav => "jav",
127
+ Lang::Kor => "kor",
128
+ Lang::Nob => "nob",
129
+ Lang::Dan => "dan",
130
+ Lang::Swe => "swe",
131
+ Lang::Fin => "fin",
132
+ Lang::Tur => "tur",
133
+ Lang::Nld => "nld",
134
+ Lang::Hun => "hun",
135
+ Lang::Ces => "ces",
136
+ Lang::Ell => "ell",
137
+ Lang::Bul => "bul",
138
+ Lang::Bel => "bel",
139
+ Lang::Mar => "mar",
140
+ Lang::Kan => "kan",
141
+ Lang::Ron => "ron",
142
+ Lang::Slv => "slv",
143
+ Lang::Hrv => "hrv",
144
+ Lang::Srp => "srp",
145
+ Lang::Mkd => "mkd",
146
+ Lang::Lit => "lit",
147
+ Lang::Lav => "lav",
148
+ Lang::Est => "est",
149
+ Lang::Tam => "tam",
150
+ Lang::Vie => "vie",
151
+ Lang::Urd => "urd",
152
+ Lang::Tha => "tha",
153
+ Lang::Guj => "guj",
154
+ Lang::Uzb => "uzb",
155
+ Lang::Pan => "pan",
156
+ Lang::Aze => "aze",
157
+ Lang::Ind => "ind",
158
+ Lang::Tel => "tel",
159
+ Lang::Pes => "pes",
160
+ Lang::Mal => "mal",
161
+ Lang::Ori => "ori",
162
+ Lang::Mya => "mya",
163
+ Lang::Nep => "nep",
164
+ Lang::Sin => "sin",
165
+ Lang::Khm => "khm",
166
+ Lang::Tuk => "tuk",
167
+ Lang::Aka => "aka",
168
+ Lang::Zul => "zul",
169
+ Lang::Sna => "sna",
170
+ Lang::Afr => "afr",
171
+ Lang::Lat => "lat",
172
+ Lang::Slk => "slk",
173
+ Lang::Cat => "cat",
174
+ Lang::Tgl => "tgl",
175
+ Lang::Hye => "hye",
176
+ Lang::Epo => "epo",
177
+ Lang::Ben => "ben",
178
+ Lang::Cym => "cym",
179
+ }
180
+ .to_string()
181
+ }
182
+
183
+ #[cfg(test)]
184
+ mod tests {
185
+ use super::*;
186
+
187
+ #[test]
188
+ fn test_detect_single_language_english() {
189
+ let text = "Hello world! This is a test of the language detection system.";
190
+ let config = LanguageDetectionConfig {
191
+ enabled: true,
192
+ min_confidence: 0.8,
193
+ detect_multiple: false,
194
+ };
195
+
196
+ let result = detect_languages(text, &config).unwrap();
197
+ assert!(result.is_some());
198
+ let langs = result.unwrap();
199
+ assert_eq!(langs.len(), 1);
200
+ assert_eq!(langs[0], "eng");
201
+ }
202
+
203
+ #[test]
204
+ fn test_detect_single_language_spanish() {
205
+ let text = "Hola mundo! Esta es una prueba del sistema de detección de idiomas.";
206
+ let config = LanguageDetectionConfig {
207
+ enabled: true,
208
+ min_confidence: 0.8,
209
+ detect_multiple: false,
210
+ };
211
+
212
+ let result = detect_languages(text, &config).unwrap();
213
+ assert!(result.is_some());
214
+ let langs = result.unwrap();
215
+ assert_eq!(langs.len(), 1);
216
+ assert_eq!(langs[0], "spa");
217
+ }
218
+
219
+ #[test]
220
+ fn test_detect_multiple_languages() {
221
+ let text = "Hello world! This is English text. The quick brown fox jumps over the lazy dog. \
222
+ Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. \
223
+ Bonjour le monde! Ceci est un texte en français. Le renard brun rapide saute par-dessus le chien paresseux.";
224
+ let config = LanguageDetectionConfig {
225
+ enabled: true,
226
+ min_confidence: 0.3,
227
+ detect_multiple: true,
228
+ };
229
+
230
+ let result = detect_languages(text, &config).unwrap();
231
+ if let Some(langs) = result {
232
+ assert!(
233
+ !langs.is_empty(),
234
+ "If detection succeeds, should return at least one language"
235
+ );
236
+ }
237
+ }
238
+
239
+ #[test]
240
+ fn test_detect_disabled() {
241
+ let text = "Hello world!";
242
+ let config = LanguageDetectionConfig {
243
+ enabled: false,
244
+ min_confidence: 0.8,
245
+ detect_multiple: false,
246
+ };
247
+
248
+ let result = detect_languages(text, &config).unwrap();
249
+ assert!(result.is_none());
250
+ }
251
+
252
+ #[test]
253
+ fn test_detect_empty_text() {
254
+ let text = "";
255
+ let config = LanguageDetectionConfig {
256
+ enabled: true,
257
+ min_confidence: 0.8,
258
+ detect_multiple: false,
259
+ };
260
+
261
+ let result = detect_languages(text, &config).unwrap();
262
+ assert!(result.is_none());
263
+ }
264
+
265
+ #[test]
266
+ fn test_lang_to_iso639_3() {
267
+ assert_eq!(lang_to_iso639_3(Lang::Eng), "eng");
268
+ assert_eq!(lang_to_iso639_3(Lang::Spa), "spa");
269
+ assert_eq!(lang_to_iso639_3(Lang::Fra), "fra");
270
+ assert_eq!(lang_to_iso639_3(Lang::Deu), "deu");
271
+ assert_eq!(lang_to_iso639_3(Lang::Cmn), "cmn");
272
+ }
273
+
274
+ #[test]
275
+ fn test_confidence_threshold_filters_low_confidence() {
276
+ let text = "ok yes no";
277
+ let high_confidence_config = LanguageDetectionConfig {
278
+ enabled: true,
279
+ min_confidence: 0.99,
280
+ detect_multiple: false,
281
+ };
282
+
283
+ let result = detect_languages(text, &high_confidence_config).unwrap();
284
+ assert!(result.is_none());
285
+ }
286
+
287
+ #[test]
288
+ fn test_confidence_threshold_accepts_high_confidence() {
289
+ let text = "The quick brown fox jumps over the lazy dog. This is definitely English text with clear patterns.";
290
+ let low_confidence_config = LanguageDetectionConfig {
291
+ enabled: true,
292
+ min_confidence: 0.5,
293
+ detect_multiple: false,
294
+ };
295
+
296
+ let result = detect_languages(text, &low_confidence_config).unwrap();
297
+ assert!(result.is_some());
298
+ let langs = result.unwrap();
299
+ assert_eq!(langs.len(), 1);
300
+ assert_eq!(langs[0], "eng");
301
+ }
302
+
303
+ #[test]
304
+ fn test_confidence_threshold_boundary_low() {
305
+ let text =
306
+ "This is a comprehensive English sentence with multiple words to ensure accurate language detection.";
307
+ let very_low_threshold = LanguageDetectionConfig {
308
+ enabled: true,
309
+ min_confidence: 0.01,
310
+ detect_multiple: false,
311
+ };
312
+
313
+ let result = detect_languages(text, &very_low_threshold).unwrap();
314
+ assert!(result.is_some());
315
+ let langs = result.unwrap();
316
+ assert_eq!(langs.len(), 1);
317
+ assert_eq!(langs[0], "eng");
318
+ }
319
+
320
+ #[test]
321
+ fn test_confidence_threshold_boundary_high() {
322
+ let text = "The quick brown fox jumps over the lazy dog.";
323
+ let max_threshold = LanguageDetectionConfig {
324
+ enabled: true,
325
+ min_confidence: 1.0,
326
+ detect_multiple: false,
327
+ };
328
+
329
+ let result = detect_languages(text, &max_threshold).unwrap();
330
+ if let Some(langs) = result {
331
+ assert_eq!(langs.len(), 1);
332
+ }
333
+ }
334
+
335
+ #[test]
336
+ fn test_confidence_threshold_multiple_languages() {
337
+ let text = format!(
338
+ "{}{}",
339
+ "Hello world! This is English text. The quick brown fox jumps over the lazy dog. ".repeat(10),
340
+ "Hola mundo! Este es texto en español. El rápido zorro marrón salta sobre el perro perezoso. ".repeat(10)
341
+ );
342
+ let high_confidence_config = LanguageDetectionConfig {
343
+ enabled: true,
344
+ min_confidence: 0.5,
345
+ detect_multiple: true,
346
+ };
347
+
348
+ let result = detect_languages(&text, &high_confidence_config).unwrap();
349
+ if let Some(langs) = result {
350
+ assert!(
351
+ !langs.is_empty(),
352
+ "If detection succeeds, should find at least one language"
353
+ );
354
+ let has_expected = langs.contains(&"eng".to_string())
355
+ || langs.contains(&"spa".to_string())
356
+ || langs.contains(&"fra".to_string());
357
+ assert!(has_expected, "Should detect at least one of the languages in the text");
358
+ }
359
+ }
360
+
361
+ #[test]
362
+ fn test_confidence_threshold_filters_all_chunks() {
363
+ let text = "a b c d e f g h i j k ".repeat(50);
364
+ let high_confidence_config = LanguageDetectionConfig {
365
+ enabled: true,
366
+ min_confidence: 0.95,
367
+ detect_multiple: true,
368
+ };
369
+
370
+ let result = detect_languages(&text, &high_confidence_config).unwrap();
371
+ assert!(result.is_none() || result.unwrap().is_empty());
372
+ }
373
+
374
+ #[test]
375
+ fn test_default_confidence_threshold() {
376
+ let text = "This is a clear English sentence. The quick brown fox jumps over the lazy dog. \
377
+ English text is easy to detect when there is sufficient content to analyze. \
378
+ Language detection works best with longer text passages that provide more context.";
379
+ let config = LanguageDetectionConfig {
380
+ enabled: true,
381
+ min_confidence: 0.5,
382
+ detect_multiple: false,
383
+ };
384
+
385
+ let result = detect_languages(text, &config).unwrap();
386
+ if let Some(langs) = result {
387
+ assert_eq!(langs.len(), 1, "Single language mode should return one language");
388
+ assert_eq!(langs[0], "eng", "Should detect English");
389
+ }
390
+ }
391
+
392
/// English block followed by a Spanish block (five repetitions each), detected in
/// multi-language mode. Requires a non-empty result naming `eng` or `spa`.
#[test]
fn test_english_spanish_document() {
    let english = "The global economy has been experiencing significant changes in recent years. International cooperation is essential for addressing climate change and sustainable development. ".repeat(5);
    let spanish = "La economía global ha estado experimentando cambios significativos en los últimos años. La cooperación internacional es esencial para abordar el cambio climático y el desarrollo sostenible. ".repeat(5);
    let document = [english, spanish].concat();

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("English/Spanish document should yield a detection result");
    assert!(!langs.is_empty());
    assert!(langs.iter().any(|lang| lang == "eng" || lang == "spa"));
}
411
+
412
/// Chinese block followed by an English block (five repetitions each) at
/// `min_confidence` 0.4. Requires a non-empty result naming `cmn` or `eng`.
#[test]
fn test_chinese_english_document() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    let mut document =
        "中国是世界上人口最多的国家。中文是世界上使用人数最多的语言之一。中华文明有着五千年的悠久历史。".repeat(5);
    document.push_str(
        &"China is the most populous country in the world. Chinese is one of the most widely spoken languages. Chinese civilization has a long history of five thousand years. ".repeat(5),
    );

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("Chinese/English document should yield a detection result");
    assert!(!langs.is_empty());
    assert!(langs.iter().any(|lang| lang == "cmn" || lang == "eng"));
}
431
+
432
/// French block followed by a German block (five repetitions each) in
/// multi-language mode. Only requires that some language is reported.
#[test]
fn test_french_german_document() {
    let french = "La France est connue pour sa culture riche et sa cuisine délicieuse. Paris est la capitale de la France et une destination touristique populaire. ".repeat(5);
    let german = "Deutschland ist bekannt für seine Ingenieurskunst und seine reiche Geschichte. Berlin ist die Hauptstadt Deutschlands und eine lebendige Metropole. ".repeat(5);
    let document = french + &german;

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("French/German document should yield a detection result");
    assert!(!langs.is_empty());
}
450
+
451
/// Russian block followed by a Ukrainian block (five repetitions each) in
/// multi-language mode. Only requires that some language is reported.
#[test]
fn test_russian_ukrainian_document() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let russian = "Россия является крупнейшей страной в мире по территории. Москва - столица России и крупнейший город страны. ".repeat(5);
    let ukrainian = "Україна є країною в Східній Європі. Київ - столиця України та найбільше місто країни. ".repeat(5);
    let document = [russian, ukrainian].concat();

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("Russian/Ukrainian document should yield a detection result");
    assert!(!langs.is_empty());
}
469
+
470
/// Interleaved Italian, Portuguese and Spanish sentences (three repetitions) in
/// multi-language mode. Only requires that some language is reported.
#[test]
fn test_romance_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sample = "L'Italia è famosa per la sua arte e architettura. O português é falado em vários países. El español es uno de los idiomas más hablados del mundo. ".repeat(3);

    let langs = detect_languages(&sample, &config)
        .unwrap()
        .expect("Romance-language sample should yield a detection result");
    assert!(!langs.is_empty());
}
484
+
485
/// Interleaved German, Dutch and Swedish sentences (three repetitions) in
/// multi-language mode. Only requires that some language is reported.
#[test]
fn test_germanic_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sample = "Deutschland hat eine reiche Kulturgeschichte. Nederland is bekend om zijn tulpen en windmolens. Sverige är känt för sina skogar och innovationer. ".repeat(3);

    let langs = detect_languages(&sample, &config)
        .unwrap()
        .expect("Germanic-language sample should yield a detection result");
    assert!(!langs.is_empty());
}
499
+
500
/// Interleaved Polish, Czech and Bulgarian sentences (three repetitions) in
/// multi-language mode. Only requires that some language is reported.
#[test]
fn test_slavic_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };
    let sample = "Polska jest krajem w Europie Środkowej. Česká republika má bohatou historii. България е страна на Балканския полуостров. ".repeat(3);

    let langs = detect_languages(&sample, &config)
        .unwrap()
        .expect("Slavic-language sample should yield a detection result");
    assert!(!langs.is_empty());
}
514
+
515
/// Interleaved Chinese, Japanese and Korean sentences (three repetitions) at
/// `min_confidence` 0.4. Only requires that some language is reported.
#[test]
fn test_cjk_languages() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };
    let sample = "中国是一个历史悠久的国家。日本は美しい桜の国です。한국은 아시아의 선진국입니다。".repeat(3);

    let langs = detect_languages(&sample, &config)
        .unwrap()
        .expect("CJK sample should yield a detection result");
    assert!(!langs.is_empty());
}
529
+
530
/// An Arabic sentence followed by a Persian sentence (five repetitions) at
/// `min_confidence` 0.4. Only requires that some language is reported.
#[test]
fn test_arabic_persian() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };
    let sample = "اللغة العربية هي واحدة من أقدم اللغات في العالم. زبان فارسی زبانی زیبا و شاعرانه است. ".repeat(5);

    let langs = detect_languages(&sample, &config)
        .unwrap()
        .expect("Arabic/Persian sample should yield a detection result");
    assert!(!langs.is_empty());
}
544
+
545
/// A single five-letter word in single-language mode. `None` is acceptable;
/// a `Some` result must not be empty.
#[test]
fn test_very_short_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    let outcome = detect_languages("Hello", &config).unwrap();
    assert!(outcome.map_or(true, |langs| !langs.is_empty()));
}
559
+
560
/// One English sentence in single-language mode must be reported as exactly
/// one language, `eng`.
#[test]
fn test_medium_length_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sentence =
        "Machine learning is a subset of artificial intelligence that enables computers to learn from data.";

    let langs = detect_languages(sentence, &config)
        .unwrap()
        .expect("a full English sentence should yield a detection result");
    // Equivalent to checking `len() == 1` and `langs[0] == "eng"` separately.
    assert_eq!(langs, ["eng"]);
}
575
+
576
/// A three-sentence English paragraph repeated 20 times, checked at
/// `min_confidence` 0.7 in single-language mode. Must be reported as exactly `eng`.
#[test]
fn test_very_long_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.7,
        detect_multiple: false,
    };
    let paragraph = concat!(
        "The advancement of technology in the twenty-first century has transformed how we live, work, and communicate. ",
        "From smartphones to artificial intelligence, these innovations have created unprecedented opportunities and challenges. ",
        "Understanding the implications of technological progress requires careful consideration of ethical, social, and economic factors. "
    );
    let long_text = paragraph.repeat(20);

    let langs = detect_languages(&long_text, &config)
        .unwrap()
        .expect("long English text should yield a detection result");
    assert_eq!(langs.len(), 1);
    assert_eq!(langs[0], "eng");
}
594
+
595
/// Digits and spaces only: the detector is expected to return `None`.
#[test]
fn test_numbers_only() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    let outcome = detect_languages("123456789 0123456789 987654321", &config).unwrap();
    assert!(outcome.is_none());
}
607
+
608
/// Punctuation and symbols only: the detector is expected to return `None`.
#[test]
fn test_punctuation_only() {
    let symbols = "!!! ??? ... --- *** @@@ ###";
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    assert_eq!(detect_languages(symbols, &config).unwrap(), None);
}
620
+
621
/// Spaces, tabs and newlines only: the detector is expected to return `None`.
#[test]
fn test_whitespace_only() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let blank = " \t\n \n\n\t\t ";

    let outcome = detect_languages(blank, &config).unwrap();
    assert!(outcome.is_none());
}
633
+
634
/// English prose sprinkled with numerals must still be reported with `eng`
/// as the first language.
#[test]
fn test_mixed_numbers_and_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose = "The year 2024 marks the 100th anniversary of the founding. Over 50 countries participated in the event with more than 10,000 attendees.";

    let langs = detect_languages(prose, &config)
        .unwrap()
        .expect("English prose with numerals should yield a detection result");
    assert_eq!(langs[0], "eng");
}
648
+
649
/// English prose containing a URL and an e-mail address must still be reported
/// with `eng` as the first language.
#[test]
fn test_text_with_urls() {
    let prose = "Visit our website at https://example.com for more information. You can also contact us at info@example.com or follow us on social media.";
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    let langs = detect_languages(prose, &config)
        .unwrap()
        .expect("English prose with a URL should yield a detection result");
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
663
+
664
/// English prose containing several e-mail addresses must still be reported
/// with `eng` as the first language.
#[test]
fn test_text_with_email_addresses() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose = "Please send your resume to jobs@company.com or contact.us@example.org for inquiries. Our support team at support@help.com is available 24/7.";

    let detected = detect_languages(prose, &config).unwrap();
    let langs = detected.expect("English prose with e-mail addresses should yield a detection result");
    assert_eq!(langs[0], "eng");
}
678
+
679
// Runs single-language detection (min_confidence 0.4) on a Rust snippet whose
// only natural-language content is its English `//` comments. The test is
// tolerant: `None` is accepted, and a `Some` result only has to be non-empty.
+ #[test]
680
+ fn test_code_with_comments() {
681
+ let text = r#"
682
+ // This function calculates the factorial of a number
683
+ fn factorial(n: u64) -> u64 {
684
+ if n == 0 {
685
+ return 1;
686
+ }
687
+ n * factorial(n - 1)
688
+ }
689
+
690
+ // The algorithm uses recursion to compute the result efficiently
691
+ // It handles edge cases like zero and negative numbers appropriately
692
+ "#;
693
+ let config = LanguageDetectionConfig {
694
+ enabled: true,
695
+ min_confidence: 0.4,
696
+ detect_multiple: false,
697
+ };
698
+
699
+ let result = detect_languages(text, &config).unwrap();
700
+ if let Some(langs) = result {
701
+ assert!(!langs.is_empty());
702
+ }
703
+ }
704
+
705
// Runs single-language detection (min_confidence 0.5) on a snippet that is
// Rust code with no prose. Passes when the result is `None` or holds at most
// one language.
// NOTE(review): the `is_empty()` clause is already implied by `len() <= 1`,
// so the assertion could be reduced to the first and last clauses.
+ #[test]
706
+ fn test_predominantly_code() {
707
+ let text = r#"
708
+ let x = 42;
709
+ let y = x * 2;
710
+ println!("{}", y);
711
+ fn main() {
712
+ let vec = vec![1, 2, 3];
713
+ for i in vec {
714
+ println!("{}", i);
715
+ }
716
+ }
717
+ "#;
718
+ let config = LanguageDetectionConfig {
719
+ enabled: true,
720
+ min_confidence: 0.5,
721
+ detect_multiple: false,
722
+ };
723
+
724
+ let result = detect_languages(text, &config).unwrap();
725
+ assert!(result.is_none() || result.as_ref().unwrap().is_empty() || result.as_ref().unwrap().len() <= 1);
726
+ }
727
+
728
// Runs single-language detection (min_confidence 0.5) on English documentation
// prose that embeds two lines of example code. Requires a result whose first
// entry is "eng".
+ #[test]
729
+ fn test_documentation_with_code() {
730
+ let text = r#"
731
+ Language detection is an important feature in document processing systems.
732
+ It allows applications to automatically identify the language of text content.
733
+ This is particularly useful for multilingual documents and international applications.
734
+
735
+ Example code:
736
+ let config = LanguageDetectionConfig::default();
737
+ let result = detect_languages(text, &config);
738
+
739
+ The detection algorithm analyzes character patterns and word frequencies to determine the most likely language.
740
+ Modern detection systems achieve high accuracy rates across dozens of languages.
741
+ "#;
742
+ let config = LanguageDetectionConfig {
743
+ enabled: true,
744
+ min_confidence: 0.5,
745
+ detect_multiple: false,
746
+ };
747
+
748
+ let result = detect_languages(text, &config).unwrap();
749
+ assert!(result.is_some());
750
+ let langs = result.unwrap();
751
+ assert_eq!(langs[0], "eng");
752
+ }
753
+
754
/// English text dense with medical vocabulary must be reported with `eng`
/// as the first language.
#[test]
fn test_medical_terminology() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let report = concat!(
        "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. ",
        "The electrocardiogram showed significant ST-segment elevation in the anterior leads. ",
        "Cardiac biomarkers including troponin and creatine kinase were significantly elevated."
    );

    let langs = detect_languages(report, &config)
        .unwrap()
        .expect("medical English text should yield a detection result");
    assert_eq!(langs[0], "eng");
}
770
+
771
/// English text dense with legal vocabulary must be reported with `eng`
/// as the first language.
#[test]
fn test_legal_terminology() {
    let filing = concat!(
        "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. ",
        "Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. ",
        "The court finds that the preponderance of evidence supports the plaintiff's claims."
    );
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };

    let detected = detect_languages(filing, &config).unwrap();
    let langs = detected.expect("legal English text should yield a detection result");
    assert_eq!(langs.first().map(String::as_str), Some("eng"));
}
787
+
788
/// English text dense with scientific vocabulary must be reported with `eng`
/// as the first language.
#[test]
fn test_scientific_terminology() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let abstract_text = concat!(
        "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. ",
        "Quantum mechanical calculations were performed using density functional theory at the B3LYP level. ",
        "The results demonstrated significant correlation between molecular structure and optical properties."
    );

    let langs = detect_languages(abstract_text, &config)
        .unwrap()
        .expect("scientific English text should yield a detection result");
    assert_eq!(langs[0], "eng");
}
804
+
805
/// A Latin-script English block followed by a Cyrillic-script Russian block
/// (five repetitions each). Only requires that some language is reported.
#[test]
fn test_latin_cyrillic_mix() {
    let latin = "Modern technology enables global communication across language barriers. ".repeat(5);
    let cyrillic = "Современные технологии позволяют общаться по всему миру. ".repeat(5);
    let document = [latin, cyrillic].concat();

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("Latin/Cyrillic document should yield a detection result");
    assert!(!langs.is_empty());
}
823
+
824
/// A Latin-script English block followed by a Chinese block (five repetitions
/// each) at `min_confidence` 0.4. Only requires that some language is reported.
#[test]
fn test_latin_cjk_mix() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    let mut document = "Technology companies are expanding into Asian markets. ".repeat(5);
    document.push_str(&"科技公司正在进军亚洲市场。".repeat(5));

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("Latin/CJK document should yield a detection result");
    assert!(!langs.is_empty());
}
842
+
843
/// A Latin-script English block followed by an Arabic block (five repetitions
/// each) at `min_confidence` 0.4. Only requires that some language is reported.
#[test]
fn test_latin_arabic_mix() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.4,
        detect_multiple: true,
    };

    let latin = "International cooperation is essential for global peace and prosperity. ".repeat(5);
    let arabic = "التعاون الدولي ضروري للسلام والازدهار العالمي. ".repeat(5);
    let document = latin + &arabic;

    let langs = detect_languages(&document, &config)
        .unwrap()
        .expect("Latin/Arabic document should yield a detection result");
    assert!(!langs.is_empty());
}
861
+
862
/// Runs single-language detection (min_confidence 0.3) on isolated words.
///
/// The test is deliberately tolerant: a word may yield `None`, and the detected
/// language is not compared with the nominal one. The only requirement is that
/// a `Some` result is never an empty list. The nominal language is carried along
/// so that a failure message identifies which sample broke.
#[test]
fn test_single_word_detection() {
    // Fixed sample set, so a plain array is enough (no heap-allocated `Vec`).
    let words = [("hello", "eng"), ("bonjour", "fra"), ("hola", "spa"), ("привет", "rus")];

    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.3,
        detect_multiple: false,
    };

    for &(word, nominal_lang) in &words {
        if let Some(langs) = detect_languages(word, &config).unwrap() {
            assert!(
                !langs.is_empty(),
                "detection for {:?} (nominally {}) returned an empty language list",
                word,
                nominal_lang
            );
        }
    }
}
879
+
880
/// One word repeated 500 times in single-language mode. `None` is acceptable;
/// a `Some` result must not be empty.
#[test]
fn test_repetitive_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let repeated = "test test test test test ".repeat(100);

    let outcome = detect_languages(&repeated, &config).unwrap();
    assert!(outcome.map_or(true, |langs| !langs.is_empty()));
}
894
+
895
/// Detecting the same sentence twice with the same configuration must give
/// equal results.
#[test]
fn test_detection_consistency() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let sentence = "This is a consistent test of language detection capabilities across multiple runs.";

    let first_run = detect_languages(sentence, &config).unwrap();
    let second_run = detect_languages(sentence, &config).unwrap();

    assert_eq!(first_run, second_run, "Detection should be deterministic");
}
909
+
910
/// Runs multi-language detection on two inputs: 500 copies of the letter `a`,
/// and 30 repetitions of a short English sentence.
///
/// NOTE(review): the input sizes are presumably chosen relative to the
/// detector's internal chunk size — confirm against the implementation.
///
/// The previous assertions (`is_none() || is_some()`) were always true and so
/// verified nothing. The checks that remain meaningful are:
/// * `unwrap()` — detection must not return an error for either input;
/// * a `Some` result must carry at least one language, matching the invariant
///   the other tolerant tests in this module rely on.
#[test]
fn test_chunk_size_boundary() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: true,
    };

    let chunk_text = "a".repeat(500);
    let over_chunk = "This is English text. ".repeat(30);

    for input in [&chunk_text, &over_chunk].iter() {
        if let Some(langs) = detect_languages(input, &config).unwrap() {
            assert!(
                !langs.is_empty(),
                "a successful detection must report at least one language"
            );
        }
    }
}
926
+
927
/// English prose containing percent signs, quotes, hashtags and @-mentions
/// must still be reported with `eng` as the first language.
#[test]
fn test_special_characters_with_text() {
    let config = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.5,
        detect_multiple: false,
    };
    let prose =
        "The company's revenue increased by 25% year-over-year. CEO said: \"We're excited!\" #growth @investors";

    let langs = detect_languages(prose, &config)
        .unwrap()
        .expect("English prose with special characters should yield a detection result");
    assert_eq!(langs[0], "eng");
}
942
+ }