kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'layout'
4
+ require_relative 'layouts/qwerty'
5
+ require_relative 'layouts/qwertz'
6
+ require_relative 'layouts/azerty'
7
+ require_relative 'layouts/jcuken'
8
+ require_relative 'layouts/dvorak'
9
+
10
+ module Kotoshu
11
+ module Keyboard
12
# Registry for keyboard layouts
#
# The registry provides a centralized way to access keyboard layouts
# and automatically selects the appropriate layout for a given language.
# Layouts are stored as instances keyed by their class name.
#
# @example Getting layout for a language
#   layout = Keyboard::Registry.layout_for('de')
#   layout.name # => "QWERTZ"
#
# @example Getting layout by name
#   layout = Keyboard::Registry.layout_by_name('Dvorak')
#   layout.name # => "Dvorak"
#
# @example Listing all available layouts
#   Keyboard::Registry.available_layouts.each do |layout|
#     puts "#{layout.name}: #{layout.language_codes.join(', ')}"
#   end
#
class Registry
  # Registry key of the layout used when no other layout matches
  FALLBACK_LAYOUT_KEY = 'Kotoshu::Keyboard::Layouts::QWERTY'

  class << self
    # Register a keyboard layout
    #
    # The layout class is instantiated once and stored under its class name,
    # so registering the same class again replaces the previous instance.
    #
    # @param layout_class [Class<Layout>] the layout class to register
    # @return [Layout] the instantiated layout
    def register(layout_class)
      layouts[layout_class.name] = layout_class.new
    end

    # Get layout for a specific language code
    #
    # Searches for a layout that supports the given language code as-is,
    # then retries with the base language of a regional variant. Both
    # hyphenated ('en-GB') and underscored ('en_US') variants are reduced
    # to their base language. Falls back to the default layout.
    #
    # @param language_code [String, Symbol] the language code (e.g., 'en', 'de', 'fr', 'ru')
    # @return [Layout] the keyboard layout for the language
    def layout_for(language_code)
      layout = find_layout_supporting(language_code)

      # Retry with the base language only when it differs from what was
      # already tried ('en-GB' -> 'en', 'en_US' -> 'en', :de -> 'de')
      base_lang = language_code.to_s.split(/[-_]/).first
      layout ||= find_layout_supporting(base_lang) if base_lang && base_lang != language_code

      layout || default_layout
    end

    # Get layout by name
    #
    # An exact match on the layout name or class name is preferred; if
    # there is none, the name is matched case-insensitively so that
    # symbols such as :dvorak resolve as well.
    #
    # @param name [String, Symbol] the layout name (e.g., 'QWERTY', 'Dvorak')
    # @return [Layout] the layout, or QWERTY as fallback if not found
    def layout_by_name(name)
      # Falls back to QWERTY directly; the configured default is not consulted here
      find_layout_named(name) || layouts[FALLBACK_LAYOUT_KEY]
    end

    # Get all available layouts
    #
    # @return [Array<Layout>] list of all registered layouts
    def available_layouts
      layouts.values
    end

    # Get all supported language codes
    #
    # @return [Array<String>] sorted, de-duplicated language codes across all layouts
    def supported_languages
      layouts.values.flat_map(&:language_codes).uniq.sort
    end

    # Set the default layout
    #
    # The name is only stored here; it is resolved when a default is needed.
    #
    # @param layout_name [String, Symbol] the name of the layout to use as default
    def register_default(layout_name)
      @default_layout_name = layout_name
    end

    # Check if a language is supported
    #
    # The code is passed to the layouts unchanged (no variant reduction).
    #
    # @param language_code [String] the language code to check
    # @return [Boolean] true if the language is supported by any layout
    def supports_language?(language_code)
      layouts.values.any? { |l| l.supports_language?(language_code) }
    end

    # Clear all registered layouts (mainly for testing)
    #
    # Also forgets the configured default layout name.
    #
    # @return [void]
    def clear!
      @layouts = nil
      @default_layout_name = nil
    end

    private

    # Get or initialize the layouts hash
    #
    # @return [Hash] hash of layout class names to instances
    def layouts
      @layouts ||= {}
    end

    # Get the default layout
    #
    # @return [Layout, nil] the configured default if it resolves, otherwise
    #   QWERTY, otherwise the first registered layout (nil if none registered)
    def default_layout
      configured = find_layout_named(@default_layout_name) if @default_layout_name
      configured || layouts[FALLBACK_LAYOUT_KEY] || layouts.values.first
    end

    # Find the first registered layout that supports a language code
    #
    # @param language_code [String, Symbol, nil] the code handed to each layout
    # @return [Layout, nil] the matching layout, or nil
    def find_layout_supporting(language_code)
      layouts.values.find { |l| l.supports_language?(language_code) }
    end

    # Find a registered layout by layout name or by class name suffix
    #
    # @param name [String, Symbol] the layout name or unqualified class name
    # @return [Layout, nil] the matching layout, or nil
    def find_layout_named(name)
      wanted = name.to_s
      suffix = "::#{wanted}"
      candidates = layouts.values

      # Exact matches win so existing lookups resolve exactly as before
      exact = candidates.find { |l| l.name == wanted || l.class.name.end_with?(suffix) }
      return exact if exact

      folded_suffix = suffix.downcase
      candidates.find do |l|
        l.name.to_s.casecmp?(wanted) || l.class.name.downcase.end_with?(folded_suffix)
      end
    end
  end

  # Auto-register all layout classes on load
  register(Layouts::QWERTY)
  register(Layouts::QWERTZ)
  register(Layouts::AZERTY)
  register(Layouts::JCUKEN)
  register(Layouts::Dvorak)
end
145
+ end
146
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'keyboard/registry'
4
+
5
+ module Kotoshu
6
# Keyboard layout system for Kotoshu
#
# This module provides access to keyboard layouts for typo detection
# and suggestion ranking in multi-language spell checking. Every method
# is a thin delegation to {Registry}.
#
# @example Getting a keyboard layout for a language
#   layout = Kotoshu::Keyboard.layout_for('de')
#   layout.distance('z', 'y') # => 1 (adjacent on QWERTZ)
#
# @example Getting a layout by name
#   dvorak = Kotoshu::Keyboard.layout_by_name('Dvorak')
#   dvorak.distance('a', 'e') # => 2 (home row on Dvorak)
#
module Keyboard
  class << self
    # Get keyboard layout for a language code
    #
    # @param language_code [String] the language code (e.g., 'en', 'de', 'fr', 'ru')
    # @return [Layout] the keyboard layout for the language
    def layout_for(language_code)
      Registry.layout_for(language_code)
    end

    # Get keyboard layout by name
    #
    # @param name [String, Symbol] the layout name (e.g., 'QWERTY', 'Dvorak')
    # @return [Layout] the layout, or QWERTY as fallback if not found
    def layout_by_name(name)
      Registry.layout_by_name(name)
    end

    # Get all available layouts
    #
    # @return [Array<Layout>] list of all registered layouts
    def available_layouts
      Registry.available_layouts
    end

    # Get all supported language codes
    #
    # @return [Array<String>] list of all language codes across all layouts
    def supported_languages
      Registry.supported_languages
    end

    # Check if a language is supported
    #
    # @param language_code [String] the language code to check
    # @return [Boolean] true if the language is supported by any layout
    def supports_language?(language_code)
      Registry.supports_language?(language_code)
    end
  end
end
60
+ end
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Language
5
# Language detection based on character sets and patterns.
#
# Each candidate language is scored by the share of characters in the text
# that match its pattern, adjusted by script bonuses. The highest score
# wins; the optional :priority of a language only breaks exact score ties.
#
# @example Detect language
#   Language::Detector.detect("Hello world") # => "en"
class Detector
  # Accented Latin-1 letters in both cases
  # (U+00C0..U+00FF without the multiplication and division signs)
  ACCENTED_LATIN = /[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]/

  # Character set ranges for language detection
  CHARACTER_SETS = {
    cyrillic: /\p{Cyrillic}/,
    hiragana: /[\u3040-\u309F]/,
    katakana: /[\u30A0-\u30FF]/,
    cjk: /[\u4E00-\u9FFF]/,
    hangul: /[\uAC00-\uD7AF]/,
    latin: /[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]/
  }.freeze

  # Language-specific patterns
  #
  # Every :pattern must match exactly ONE character, because the number of
  # matches is divided by the text length to obtain a ratio.
  LANGUAGE_PATTERNS = {
    # Russian: Cyrillic
    russian: {
      pattern: /\p{Cyrillic}/,
      min_ratio: 0.3,
      scripts: [:cyrillic]
    },

    # Japanese: Mixed script (Hiragana + Katakana + Kanji)
    japanese: {
      pattern: /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FFF]/,
      min_ratio: 0.2,
      scripts: [:hiragana, :katakana, :cjk],
      must_have: [:hiragana] # Only require hiragana, not both
    },

    # Portuguese: Latin with specific accents
    portuguese: {
      pattern: /[ãõáàâéêíóôúç]/i,
      min_ratio: 0.05,
      scripts: [:latin]
    },

    # French: Latin with specific accents (NOT German umlauts)
    french: {
      pattern: /[éèêëàâùûüîïôç]/i,
      min_ratio: 0.02,
      scripts: [:latin],
      priority: 1 # Wins exact score ties against priority-0 languages
    },

    # Spanish: Latin with inverted punctuation
    spanish: {
      pattern: /[áéíóúüñ¿¡]/i,
      min_ratio: 0.02,
      scripts: [:latin],
      priority: 1
    },

    # German: Latin with umlauts and eszett
    german: {
      pattern: /[äöüßÄÖÜ]/, # Explicitly include uppercase
      min_ratio: 0.02,
      scripts: [:latin],
      priority: 1
    },

    # English: Latin with minimal accents
    english: {
      pattern: /[a-zA-Z]/,
      min_ratio: 0.3,
      scripts: [:latin],
      max_accent_ratio: 0.02
    }
  }.freeze

  # Language code mapping
  CODE_MAPPING = {
    russian: "ru",
    japanese: "ja",
    portuguese: "pt",
    french: "fr",
    spanish: "es",
    german: "de",
    english: "en"
  }.freeze

  class << self
    # Detect language from text.
    #
    # Returns the most probable language code based on character analysis.
    #
    # @param text [String] Text to analyze
    # @return [String, nil] Detected language code or nil if uncertain
    def detect(text)
      return nil if text.nil? || text.strip.empty?

      best_match(analyze_languages(text))&.first
    end

    # Detect with confidence score.
    #
    # Uses the same ranking as {detect}, so both always name the same language.
    #
    # @param text [String] Text to analyze
    # @return [Array<String, Float>] Language code and confidence (0-1);
    #   [nil, 0.0] when nothing matches
    def detect_with_confidence(text)
      return [nil, 0.0] if text.nil? || text.strip.empty?

      scores = analyze_languages(text)
      top_language, top_score = best_match(scores)
      return [nil, 0.0] unless top_language

      [top_language, normalize_confidence(top_score, scores.values)]
    end

    # Get multiple language candidates.
    #
    # Candidates are ordered by score, with priority breaking ties. Each
    # confidence is the candidate's share of the sum of ALL scores, not only
    # of the returned ones.
    #
    # @param text [String] Text to analyze
    # @param limit [Integer] Maximum candidates to return
    # @return [Array<Array<String, Float>>] Array of [code, confidence] pairs
    def detect_candidates(text, limit: 3)
      return [] if text.nil? || text.strip.empty?

      scores = analyze_languages(text)
      return [] if scores.empty?

      total_score = scores.values.sum.to_f
      scores
        .sort_by { |code, score| [-score, -priority_for(code)] }
        .first(limit)
        .map { |code, score| [code, score / total_score] }
    end

    private

    # Pick the winning [code, score] pair.
    #
    # @param scores [Hash{String => Float}] Scores by language code
    # @return [Array(String, Float), nil] Best pair, or nil if scores is empty
    def best_match(scores)
      scores.max_by { |code, score| [score, priority_for(code)] }
    end

    # Look up the tie-break priority configured for a language code.
    #
    # @param code [String] Language code
    # @return [Integer] Configured priority, 0 when none is set
    def priority_for(code)
      LANGUAGE_PATTERNS.dig(CODE_MAPPING.key(code), :priority) || 0
    end

    # Analyze text and score each language.
    #
    # @param text [String] Text to analyze
    # @return [Hash] Hash mapping language codes to scores (only scores > 0)
    def analyze_languages(text)
      text_length = text.length.to_f
      return {} if text_length.zero?

      LANGUAGE_PATTERNS.each_with_object({}) do |(language, config), scores|
        score = score_language(text, language, config, text_length)
        scores[CODE_MAPPING[language]] = score if score.positive?
      end
    end

    # Score a specific language against text.
    #
    # @param text [String] Text to analyze
    # @param language [Symbol] Language key
    # @param config [Hash] Language configuration
    # @param text_length [Float] Length of text
    # @return [Float] Score (0-1)
    def score_language(text, language, config, text_length)
      return 0.0 unless scripts_allowed?(text, config)

      # Share of characters matching the language pattern; the length
      # includes whitespace and punctuation
      matches = text.scan(config[:pattern]).length
      ratio = matches / text_length
      return 0.0 if ratio < config[:min_ratio]
      return 0.0 if too_accented?(text, config, text_length)

      # 10% bonus for every listed script that occurs in the text
      script_hits = Array(config[:scripts]).count { |script| script?(text, script) }
      score = ratio * (1 + script_hits * 0.1)

      # Extra bonus proportional to the share of language-specific characters;
      # helps distinguish accented languages from plain English
      score *= (1 + ratio) if language != :english && matches.positive?

      [score, 1.0].min
    end

    # Check the :must_have / :must_not_have script constraints.
    #
    # @param text [String] Text to analyze
    # @param config [Hash] Language configuration
    # @return [Boolean] true if every required script is present and no
    #   forbidden script is
    def scripts_allowed?(text, config)
      Array(config[:must_have]).all? { |script| script?(text, script) } &&
        Array(config[:must_not_have]).none? { |script| script?(text, script) }
    end

    # Check the :max_accent_ratio constraint (only set for English).
    #
    # @param text [String] Text to analyze
    # @param config [Hash] Language configuration
    # @param text_length [Float] Length of text
    # @return [Boolean] true if accented letters exceed the configured limit
    def too_accented?(text, config, text_length)
      limit = config[:max_accent_ratio]
      return false unless limit

      text.scan(ACCENTED_LATIN).length / text_length > limit
    end

    # Check whether the text contains at least one character of a script.
    #
    # @param text [String] Text to analyze
    # @param script [Symbol] Key into CHARACTER_SETS
    # @return [Boolean] true if the script occurs in the text
    def script?(text, script)
      text.match?(CHARACTER_SETS[script])
    end

    # Normalize confidence score.
    #
    # @param top_score [Float] Highest score
    # @param all_scores [Array<Float>] All scores
    # @return [Float] Normalized confidence (0-1)
    def normalize_confidence(top_score, all_scores)
      return 0.0 if top_score.zero?

      second_best = all_scores.max(2)[1] || 0
      return 1.0 if second_best.zero?

      ratio = top_score / (top_score + second_best)
      (ratio * 0.8 + 0.2).clamp(0.0, 1.0) # Minimum confidence 0.2
    end
  end
end
241
+ end
242
+ end