kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,404 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # Russian language implementation.
11
+ #
12
+ # Supports multiple dialects: ru-RU, ru-BY, ru-KZ, ru-KG, ru-MD
13
+ #
14
+ # Full Hunspell integration with spell checking, POS tagging, and grammar rules
15
+ # specifically handling Russian Cyrillic script and case system.
16
+ class Russian < Language::Base
17
# Russian spell checker with Hunspell integration.
#
# Words are validated through the lookup object returned by
# Readers::LookupBuilder#build. Suggestions are produced by probing
# single-edit variants of the input (letter substitution, letter doubling,
# letter deletion) against that same lookup.
class SpellChecker < Components::SpellChecker
  # Replacement letters to try for each lowercase Cyrillic letter found in
  # the input. Defined once and frozen instead of being rebuilt on every
  # suggestion request; duplicate replacements have been removed because
  # they only caused repeated lookups of the same candidate.
  CYRILLIC_SUBSTITUTIONS = {
    'а' => %w[о и е я],
    'о' => %w[а е и],
    'е' => %w[и э а],
    'и' => %w[е],
    'п' => %w[т к],
    'т' => %w[п д],
    'к' => %w[г х],
    'н' => %w[т п],
    'с' => %w[з ш],
    'ш' => %w[с щ],
    'щ' => %w[ш],
    'б' => %w[п в],
    'в' => %w[б ф],
    'ф' => %w[в],
    'д' => %w[т],
    'г' => %w[к х],
    'х' => %w[г к],
    'я' => %w[а е],
    'ю' => %w[у],
    'ё' => %w[е],
    'ж' => %w[з ш],
    'з' => %w[с ж],
    'ь' => %w[ъ],
    'ъ' => %w[ь]
  }.freeze

  attr_reader :aff_path, :dic_path, :script, :lookuper

  # @param aff_path [String] path to the Hunspell .aff file
  # @param dic_path [String] path to the Hunspell .dic file
  # @param script [Symbol] script identifier forwarded to the lookup builder
  # @param encoding [String] encoding forwarded to the lookup builder
  def initialize(aff_path:, dic_path:, script: :cyrillic, encoding: 'UTF-8')
    @aff_path = aff_path
    @dic_path = dic_path
    @script = script
    @encoding = encoding
    @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
  end

  # Looks a word up in the dictionary.
  #
  # @param word [String, nil] word to check
  # @return [Hash] +:found+ (Boolean), +:stem+ (the first matching form's
  #   stem, or the word itself when that form has no stem; nil when not
  #   found) and +:flags+ (Array, empty when not found)
  def check(word)
    form = word.nil? || word.empty? ? nil : @lookuper.good_forms(word).first
    return { found: false, stem: nil, flags: [] } unless form

    { found: true, stem: form.stem || word, flags: form.flags&.to_a || [] }
  end

  # Proposes corrections for a word the dictionary does not accept.
  #
  # @param word [String, nil] word to correct
  # @param max_suggestions [Integer] upper bound on returned suggestions
  # @return [Array<Hash>] hashes with +:word+, +:distance+ and +:score+;
  #   empty when the word is blank or already accepted
  def suggest(word, max_suggestions: 10)
    return [] if word.nil? || word.empty?
    return [] if @lookuper.good_forms(word).first

    generate_suggestions(word, max_suggestions).take(max_suggestions)
  end

  # @param word [String, nil]
  # @return [Boolean] whether #check reports the word as found
  def correct?(word)
    check(word)[:found]
  end

  private

  # Levenshtein distance between two strings, computed with two rolling
  # rows rather than a full matrix.
  def calculate_distance(a, b)
    return a.length if b.empty?
    return b.length if a.empty?

    target = b.chars
    previous = (0..target.length).to_a
    a.each_char.with_index(1) do |source_char, row|
      current = [row]
      target.each_with_index do |target_char, col|
        cost = source_char == target_char ? 0 : 1
        current << [previous[col + 1] + 1, current[col] + 1, previous[col] + cost].min
      end
      previous = current
    end
    previous.last
  end

  # Similarity score: 1 minus the length-normalised edit distance, reduced
  # by 0.05 per rank position and never below 0.0. A precomputed distance
  # may be passed to avoid running the edit-distance algorithm twice.
  def calculate_score(original, suggestion, rank, distance: calculate_distance(original, suggestion))
    max_len = [original.length, suggestion.length].max
    # Guard the division: two empty strings would otherwise produce NaN.
    normalised = max_len.zero? ? 0.0 : distance.to_f / max_len
    [1.0 - normalised - (rank * 0.05), 0.0].max
  end

  # Builds every single-edit variant, keeps those the dictionary accepts and
  # returns them ordered by distance. Candidates are de-duplicated before
  # the dictionary is consulted so each one is looked up only once.
  def generate_suggestions(word, _max_suggestions)
    candidates = substitution_candidates(word) + doubling_candidates(word) + deletion_candidates(word)
    accepted = candidates.uniq.select { |candidate| @lookuper.good_forms(candidate).first }

    suggestions = accepted.map do |candidate|
      distance = calculate_distance(word, candidate)
      { word: candidate, distance: distance, score: calculate_score(word, candidate, 0, distance: distance) }
    end

    # sort_by is not stable, so the generation index is used as a tie-breaker
    # to keep equally distant suggestions in a deterministic order.
    suggestions.sort_by.with_index { |suggestion, index| [suggestion[:distance], index] }
  end

  # Variants with one letter swapped for a commonly confused letter.
  def substitution_candidates(word)
    word.each_char.with_index.flat_map do |char, index|
      CYRILLIC_SUBSTITUTIONS.fetch(char.downcase, []).map do |replacement|
        word.dup.tap { |variant| variant[index] = replacement }
      end
    end
  end

  # Variants with one letter doubled (every position except the first).
  def doubling_candidates(word)
    (1...word.length).map { |index| word.dup.insert(index, word[index]) }
  end

  # Variants with one letter removed; a one-letter word yields none because
  # the result would be empty.
  def deletion_candidates(word)
    return [] if word.length < 2

    (0...word.length).map { |index| word.dup.tap { |variant| variant.slice!(index) } }
  end
end
139
+
140
# Russian tokenizer with abbreviation handling.
#
# Adds nothing of its own: all behaviour is inherited from
# Language::Tokenizer::RussianTokenizer. The subclass makes the tokenizer
# available under the Russian language namespace.
class Tokenizer < Language::Tokenizer::RussianTokenizer
end
143
+
144
# Russian POS tagger.
#
# Derives a coarse part-of-speech tag and a lemma for each token from the
# first dictionary form returned by the Hunspell lookup. Results are cached
# per word for the lifetime of the tagger (see #clear_cache).
class POSTagger < Components::PosTagger
  # Dictionary flag => POS label. When a flag is not listed, #flag_to_pos
  # retries with the flag's first character.
  FLAG_TO_POS = {
    'N' => 'NOUN', 'NN' => 'NOUN', 'NNS' => 'NOUN', 'NNP' => 'NOUN_PROPER',
    'S' => 'NOUN', 'Sub' => 'NOUN',
    'V' => 'VERB', 'VB' => 'VERB', 'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
    'VBP' => 'VERB', 'VBZ' => 'VERB',
    'A' => 'ADJ', 'JJ' => 'ADJ', 'JJR' => 'ADJ', 'JJS' => 'ADJ',
    'Adj' => 'ADJ',
    'R' => 'ADV', 'RB' => 'ADV', 'RBR' => 'ADV', 'RBS' => 'ADV',
    'Adv' => 'ADV',
    'D' => 'DET', 'DT' => 'DET', 'PDT' => 'DET',
    'P' => 'PRON', 'PP' => 'PRON', 'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
    'WP' => 'PRON', 'WP$' => 'PRON_POSS',
    'Pro' => 'PRON',
    'I' => 'PREP', 'IN' => 'PREP',
    'Präp' => 'PREP',
    'C' => 'CONJ', 'CC' => 'CONJ',
    'Conj' => 'CONJ',
    'U' => 'PART', 'RP' => 'PART',
    'Pt' => 'PART',
    'INTJ' => 'INTJ', 'UH' => 'INTJ',
    'Int' => 'INTJ',
    'CD' => 'NUM',
    'FW' => 'X',
    'PUNCT' => 'PUNCT', '.' => 'PUNCT', ',' => 'PUNCT', '!' => 'PUNCT',
    '?' => 'PUNCT', ';' => 'PUNCT', ':' => 'PUNCT'
  }.freeze

  # Suffix heuristics, tried in this order; the first matching pattern wins
  # (so "им" resolves to VERB, not ADJ). Patterns match the whole suffix.
  SUFFIX_POS_PATTERNS = [
    ['VERB', /\A(?:ть|ти|чь|л|ла|ло|ли|ют|ешь|ишь|им|ите|ат|ят)\z/],
    ['ADV', /\A(?:о|е|и)\z/],
    ['NOUN', /\A(?:ость|ение|ание|ка|ник|чик|щик|ство|тель)\z/],
    ['ADJ', /\A(?:ый|ий|ой|ое|ая|ые|их|ем|им|ом|ого|ому)\z/]
  ].freeze

  attr_reader :aff_path, :dic_path, :script, :flag_mapping

  # @param aff_path [String] path to the Hunspell .aff file
  # @param dic_path [String] path to the Hunspell .dic file
  # @param script [Symbol] script identifier forwarded to the lookup builder
  # @param encoding [String] encoding forwarded to the lookup builder
  # @param flag_mapping [Hash] dictionary flag => POS label
  def initialize(aff_path:, dic_path:, script: :cyrillic, encoding: 'UTF-8', flag_mapping: FLAG_TO_POS)
    @aff_path = aff_path
    @dic_path = dic_path
    @script = script
    @encoding = encoding
    @flag_mapping = flag_mapping
    @lookuper = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script).build
    @lookup_cache = {}
  end

  # Annotates tokens with a POS tag and lemma.
  #
  # @param tokens [Array<Hash>, nil] token hashes; the word is read from +:token+
  # @return [Array<Hash>] each input hash merged with +:pos_tag+ and
  #   +:lemma+ (the lemma defaults to the word itself when none is known;
  #   both are nil for a blank word)
  def tag(tokens)
    return [] if tokens.nil? || tokens.empty?

    tokens.map do |token|
      word = token[:token]
      next token.merge(pos_tag: nil, lemma: nil) if word.nil? || word.empty?

      analysis = lookup_with_pos(word)
      token.merge(pos_tag: analysis[:pos_tag], lemma: analysis[:lemma] || word)
    end
  end

  # Replaces the flag mapping. Cached results were derived from the previous
  # mapping, so the cache is emptied to avoid returning stale tags.
  def flag_mapping=(mapping)
    @flag_mapping = mapping
    @lookup_cache.clear
  end

  # Drops every cached lookup result.
  def clear_cache
    @lookup_cache.clear
  end

  private

  # Returns +{ pos_tag:, lemma: }+ for a word, consulting the cache first.
  def lookup_with_pos(word)
    return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?
    return @lookup_cache[word] if @lookup_cache.key?(word)

    form = @lookuper.good_forms(word).first
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end

  # First POS label obtainable from the form's flags; falls back to the
  # suffix heuristic when there are no flags or none of them maps.
  def derive_pos_tag(result)
    return nil unless result

    (result.flags&.to_a || []).each do |flag|
      pos_tag = flag_to_pos(flag)
      return pos_tag if pos_tag
    end
    guess_pos_from_affix(result)
  end

  # Exact flag match first, then a match on the flag's first character.
  def flag_to_pos(flag)
    return @flag_mapping[flag] if @flag_mapping.key?(flag)

    @flag_mapping[flag[0]]
  end

  # Applies the suffix heuristic when the form exposes a suffix.
  def guess_pos_from_affix(result)
    suffix = result.suffix
    suffix ? guess_pos_from_suffix(suffix) : nil
  end

  # Label of the first pattern in SUFFIX_POS_PATTERNS matching the suffix,
  # or nil when none matches.
  def guess_pos_from_suffix(suffix)
    label, = SUFFIX_POS_PATTERNS.find { |_label, pattern| pattern.match?(suffix) }
    label
  end
end
254
+
255
# Russian grammar rules module.
module GrammarRules
  # Base class for grammar rules: stores the rule's identifying metadata and
  # requires subclasses to provide #check.
  class Rule
    attr_reader :id, :name, :description

    def initialize(id, name, description)
      @id, @name, @description = id, name, description
    end

    # @raise [NotImplementedError] always, until a subclass overrides it
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end
  end

  # Rule: Verbal aspect consistency. The check is a placeholder that never
  # reports anything; the suffix constants are not consulted by it.
  class VerbalAspectRule < Rule
    IMPERFECTIVE_SUFFIXES = ['ать', 'ять'].freeze
    PERFECTIVE_SUFFIXES = ['ить', 'по'].freeze

    def initialize
      super(
        'RU_VERBAL_ASPECT',
        'Verbal Aspect',
        'Russian verbs should use consistent aspect (imperfective/perfective).'
      )
    end

    # @return [Array] always empty (simplified implementation)
    def check(tokens)
      []
    end
  end

  # Rule: Case agreement. The check is a placeholder that never reports
  # anything.
  class CaseAgreementRule < Rule
    def initialize
      super(
        'RU_CASE_AGREEMENT',
        'Case Agreement',
        'Nouns, adjectives, and verbs must agree in case.'
      )
    end

    # @return [Array] always empty (simplified implementation)
    def check(tokens)
      []
    end
  end

  # Lookup point for the built-in rules.
  class RuleRegistry
    # @return [Array<Rule>] freshly built instances of every built-in rule
    def self.default_rules
      [VerbalAspectRule, CaseAgreementRule].map(&:new)
    end

    # @param id [String] rule identifier
    # @return [Rule, nil] the built-in rule with that id, or nil
    def self.get_rule(id)
      default_rules.find { |candidate| candidate.id == id }
    end
  end
end
310
+
311
# Registration
# Calls the inherited `register` class macro once for the bare language code
# and once per regional code (presumably adding this class to the language
# registry under each code — confirm against Language::Base).
register "ru"
register "ru-RU"
register "ru-BY"
register "ru-KZ"
register "ru-KG"
register "ru-MD"

# Hunspell affix (.aff) and dictionary (.dic) file locations keyed by
# language code. Codes without an entry fall back to 'ru-RU' in
# #resolve_hunspell_paths.
# NOTE(review): these are relative paths into the spec fixtures directory, so
# they only resolve when the process runs from a checkout that contains
# spec/ — confirm how installed copies are expected to locate dictionaries.
HUNSPELL_DICTIONARIES = {
  'ru-RU' => {
    aff: 'spec/integrational/fixtures/ru_RU.aff',
    dic: 'spec/integrational/fixtures/ru_RU.dic'
  }
}.freeze

# Display names for upper-case region codes, used by #description to label
# the variant.
VARIANT_NAMES = {
  'RU' => 'Russian',
  'BY' => 'Belarusian',
  'KZ' => 'Kazakh',
  'KG' => 'Kyrgyz',
  'MD' => 'Moldovan'
}.freeze
333
+
334
# Builds a Russian language instance.
#
# @param code [String] language code such as "ru" or "ru-RU"
# @param name [String] display name forwarded to the superclass
# @param variant [String, nil] region variant; when nil it is derived from
#   the upper-cased part of +code+ after the first hyphen
def initialize(code: "ru", name: "Russian", variant: nil)
  variant ||= extract_region_code(code)
  super(code: code, name: name, variant: variant)
  # Codes without their own dictionary entry resolve to the 'ru-RU' paths.
  @hunspell_paths = resolve_hunspell_paths(code)
end
339
+
340
# Human-readable label for this language.
#
# @return [String] the plain name when there is no variant, otherwise
#   "name (variant label)", where the label comes from VARIANT_NAMES and
#   falls back to the raw variant code
def description
  if variant
    label = VARIANT_NAMES.fetch(variant) { variant }
    "#{name} (#{label})"
  else
    name
  end
end
345
+
346
# Returns the tokenizer for this language, creating it on the first call and
# reusing the same instance afterwards.
#
# @return [Tokenizer]
def tokenizer
  @tokenizer ||= Tokenizer.new
end
349
+
350
# Returns the normalizer for this language, creating it on the first call
# and reusing the same instance afterwards. Russian uses the generic base
# normalizer rather than a language-specific one.
#
# @return [Language::Normalizer::Base]
def normalizer
  @normalizer ||= Language::Normalizer::Base.new
end
353
+
354
# Dictionary class associated with this language.
# NOTE(review): presumably the backend used for the word lists returned by
# #default_dictionary_paths — confirm against the caller.
#
# @return [Class] Dictionary::UnixWords
def dictionary_class
  Dictionary::UnixWords
end
357
+
358
# System word-list locations for this language code.
#
# @return [Array<String>] the Russian word list for "ru-RU"; the generic
#   system word list for every other code
def default_dictionary_paths
  word_list = code == "ru-RU" ? "/usr/share/dict/russian" : "/usr/share/dict/words"
  [word_list]
end
366
+
367
# Writing system of this language.
#
# @return [Symbol] always :cyrillic
def script_type
  :cyrillic
end
370
+
371
# Builds a new spell checker from the Hunspell paths resolved in
# #initialize. A fresh instance is constructed on every call.
#
# @return [SpellChecker]
def create_spell_checker
  SpellChecker.new(
    aff_path: @hunspell_paths[:aff],
    dic_path: @hunspell_paths[:dic],
    script: :cyrillic
  )
end
378
+
379
# Builds a new tokenizer. Unlike #tokenizer, the result is not memoized: a
# fresh instance is returned on every call.
#
# @return [Tokenizer]
def create_tokenizer
  Tokenizer.new
end
382
+
383
# Builds a new POS tagger from the Hunspell paths resolved in #initialize,
# using the tagger's built-in flag mapping. A fresh instance is constructed
# on every call.
#
# @return [POSTagger]
def create_pos_tagger
  POSTagger.new(
    aff_path: @hunspell_paths[:aff],
    dic_path: @hunspell_paths[:dic],
    script: :cyrillic,
    flag_mapping: POSTagger::FLAG_TO_POS
  )
end
391
+
392
+ private
393
+
394
# Extracts the region part of a language code.
#
# Everything after the first hyphen is treated as the region and returned
# upper-cased ("ru-by" => "BY"). A code without a hyphen, or with nothing
# after it ("ru-"), has no region; returning nil in that case keeps an empty
# string from being used as a variant.
#
# @param code [String] language code
# @return [String, nil] upper-cased region, or nil when absent
def extract_region_code(code)
  _language, region = code.split("-", 2)
  return nil if region.nil? || region.empty?

  region.upcase
end
398
+
399
# Looks up the Hunspell file paths for a language code.
#
# @param code [String] language code
# @return [Hash] the +:aff+/+:dic+ entry for +code+, or the 'ru-RU' entry
#   when the code has none
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['ru-RU'] }
end
402
+ end
403
+ end
404
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Load all language-specific modules
4
+ require_relative 'languages/en/language'
5
+ require_relative 'languages/fr/language'
6
+ require_relative 'languages/de/language'
7
+ require_relative 'languages/ja/language'
8
+ require_relative 'languages/pt/language'
9
+ require_relative 'languages/ru/language'
10
+ require_relative 'languages/es/language'
11
+
12
module Kotoshu
  # Languages module for language-specific implementations.
  #
  # Each language has its own namespace under this module,
  # allowing for clean organization and scalability.
  #
  # The module body here is intentionally empty: the language classes are
  # defined by the language files required at the top of this file, which
  # reopen this module.
  #
  # @example English components
  #   Kotoshu::Languages::English::SpellChecker
  #   Kotoshu::Languages::English::Tokenizer
  #   Kotoshu::Languages::English::POSTagger
  #   Kotoshu::Languages::English::GrammarRules
  #
  # @example French components
  #   Kotoshu::Languages::French::Tokenizer
  #
  # @example German components
  #   Kotoshu::Languages::German::Tokenizer
  #
  # @example Japanese components
  #   Kotoshu::Languages::Japanese::Tokenizer
  #
  # @example Portuguese components
  #   Kotoshu::Languages::Portuguese::Tokenizer
  #
  # @example Russian components
  #   Kotoshu::Languages::Russian::Tokenizer
  #
  # @example Spanish components
  #   Kotoshu::Languages::Spanish::Tokenizer
  module Languages
  end
end
@@ -0,0 +1,222 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Metrics
5
# Thread-safe metrics collector.
#
# Tracks performance metrics for spellchecking operations:
# - Lookup counts and timing
# - Cache hit/miss rates
# - Suggestion generation stats
#
# Every read and write of the counters happens inside one Mutex. Timings are
# kept as running totals, so memory use stays constant however many
# operations are recorded.
#
# @example
#   collector = Kotoshu::Metrics::Collector.new
#   collector.record_lookup("hello", result: true, time: 0.5)
#   collector.stats
#   # => { lookups: 1, correct_lookups: 1, avg_lookup_time: 0.5, ... }
class Collector
  # Metrics written by both exporters, in output order:
  # [stats key, Prometheus type, Prometheus help text].
  EXPORTED_METRICS = [
    [:lookups, 'counter', 'Total number of word lookups'],
    [:correct_lookups, 'counter', 'Number of correct word lookups'],
    [:misspelled_lookups, 'counter', 'Number of misspelled word lookups'],
    [:avg_lookup_time, 'gauge', 'Average lookup time in milliseconds'],
    [:lookup_cache_hits, 'counter', 'Number of lookup cache hits'],
    [:lookup_cache_misses, 'counter', 'Number of lookup cache misses'],
    [:suggestion_requests, 'counter', 'Number of suggestion requests'],
    [:suggestions_generated, 'counter', 'Total number of suggestions generated'],
    [:avg_suggestion_time, 'gauge', 'Average suggestion generation time in milliseconds']
  ].freeze

  # StatsD type suffix for each Prometheus type used above.
  STATSD_TYPES = { 'counter' => 'c', 'gauge' => 'ms' }.freeze

  # Initialize a new collector.
  def initialize
    @mutex = Mutex.new
    reset
  end

  # Record a lookup operation.
  #
  # @param _word [String] The word being looked up (not stored)
  # @param result [Boolean] The lookup result
  # @param time [Numeric] Time taken in milliseconds
  def record_lookup(_word, result:, time:)
    @mutex.synchronize do
      @metrics[:lookups] += 1
      @metrics[result ? :correct_lookups : :misspelled_lookups] += 1
      @metrics[:lookup_time_total] += time
    end
  end

  # Record a cache operation.
  #
  # Cache types that were not pre-seeded by #reset are counted from zero
  # instead of raising; only the lookup and suggestion caches are reported
  # by #stats.
  #
  # @param cache_type [String, Symbol] Type of cache (lookup, suggestion)
  # @param hit [Boolean] True if cache hit
  def record_cache(cache_type, hit:)
    key = :"#{cache_type}_cache_#{hit ? 'hits' : 'misses'}"
    @mutex.synchronize { @metrics[key] = @metrics.fetch(key, 0) + 1 }
  end

  # Record suggestion generation.
  #
  # @param _word [String] The input word (not stored)
  # @param count [Integer] Number of suggestions generated
  # @param time [Numeric] Time taken in milliseconds
  def record_suggestions(_word, count:, time:)
    @mutex.synchronize do
      @metrics[:suggestion_requests] += 1
      @metrics[:suggestions_generated] += count
      @metrics[:suggestion_time_total] += time
    end
  end

  # Get current metrics statistics.
  #
  # @return [Hash] Current statistics with computed averages
  def stats
    @mutex.synchronize { calculate_stats }
  end

  # Reset all metrics and restart the uptime clock.
  def reset
    @mutex.synchronize do
      @metrics = {
        lookups: 0,
        correct_lookups: 0,
        misspelled_lookups: 0,
        lookup_time_total: 0,

        lookup_cache_hits: 0,
        lookup_cache_misses: 0,
        suggestion_cache_hits: 0,
        suggestion_cache_misses: 0,

        suggestion_requests: 0,
        suggestions_generated: 0,
        suggestion_time_total: 0,

        started_at: Time.now
      }
    end
  end

  # Export metrics in StatsD format.
  #
  # @return [String] StatsD protocol lines
  def to_statsd
    snapshot = stats
    EXPORTED_METRICS.map do |key, type, _help|
      "kotoshu.#{key}:#{snapshot[key]}|#{STATSD_TYPES.fetch(type)}"
    end.join("\n")
  end

  # Export metrics in Prometheus exposition format.
  #
  # @return [String] Prometheus format
  def to_prometheus
    snapshot = stats
    EXPORTED_METRICS.flat_map do |key, type, help|
      metric = "kotoshu_#{key}"
      ["# HELP #{metric} #{help}", "# TYPE #{metric} #{type}", "#{metric} #{snapshot[key]}"]
    end.join("\n")
  end

  private

  # Calculate computed statistics. Must be called with the mutex held.
  #
  # @return [Hash] Statistics with computed values
  def calculate_stats
    m = @metrics
    {
      lookups: m[:lookups],
      correct_lookups: m[:correct_lookups],
      misspelled_lookups: m[:misspelled_lookups],
      avg_lookup_time: average(m[:lookup_time_total], m[:lookups]),

      lookup_cache_hits: m[:lookup_cache_hits],
      lookup_cache_misses: m[:lookup_cache_misses],
      lookup_cache_hit_rate: calculate_hit_rate(m[:lookup_cache_hits], m[:lookup_cache_misses]),

      suggestion_cache_hits: m[:suggestion_cache_hits],
      suggestion_cache_misses: m[:suggestion_cache_misses],
      suggestion_cache_hit_rate: calculate_hit_rate(m[:suggestion_cache_hits], m[:suggestion_cache_misses]),

      suggestion_requests: m[:suggestion_requests],
      suggestions_generated: m[:suggestions_generated],
      avg_suggestions_per_request: if m[:suggestion_requests].positive?
                                     (m[:suggestions_generated].to_f / m[:suggestion_requests]).round(2)
                                   else
                                     0
                                   end,
      avg_suggestion_time: average(m[:suggestion_time_total], m[:suggestion_requests]),

      uptime_seconds: (Time.now - m[:started_at]).round(2)
    }
  end

  # Mean of a running total, rounded to 3 decimals; 0 when nothing has been
  # recorded. fdiv keeps the division in floating point even when every
  # recorded timing was an Integer.
  #
  # @param total [Numeric] Sum of the recorded values
  # @param count [Integer] Number of recorded values
  # @return [Numeric]
  def average(total, count)
    count.positive? ? total.fdiv(count).round(3) : 0
  end

  # Calculate cache hit rate.
  #
  # @param hits [Integer] Number of hits
  # @param misses [Integer] Number of misses
  # @return [Float] Hit rate (0-1)
  def calculate_hit_rate(hits, misses)
    total = hits + misses
    total.positive? ? (hits.to_f / total).round(4) : 0.0
  end
end
221
+ end
222
+ end