kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,448 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/whitespace_tokenizer'
6
+ require_relative '../../components/pos_tagger'
7
+ require_relative '../../language/normalizer/base'
8
+ require_relative '../../grammar'
9
+
10
+ module Kotoshu
11
+ module Languages
12
+ # English language implementation.
13
+ #
14
+ # Supports multiple dialects: en-US, en-GB, en-AU, en-CA, en-NZ, en-ZA
15
+ #
16
+ # @example American English
17
+ # lang = Kotoshu::Languages::English.new(code: "en-US")
18
+ # checker = lang.create_spell_checker
19
+ # checker.correct?("color") # => true
20
+ # checker.correct?("colour") # => false
21
+ #
22
+ # @example British English
23
+ # lang = Kotoshu::Languages::English.new(code: "en-GB")
24
+ # checker.correct?("colour") # => true
25
+ class English < Language::Base
26
+ # English spell checker.
27
+ #
28
+ # Uses the Lookup algorithm with Hunspell-format dictionaries.
29
+ class SpellChecker < Components::SpellChecker
30
+ attr_reader :aff_path, :dic_path, :script
31
+
32
# Builds a spell checker backed by a Hunspell-format dictionary pair.
#
# @param aff_path [String] path of the affix (.aff) file
# @param dic_path [String] path of the word list (.dic) file
# @param script [Symbol] script identifier forwarded to the lookup builder
# @param encoding [String] encoding forwarded to the lookup builder
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'ISO-8859-1')
  @aff_path = aff_path
  @dic_path = dic_path
  @script = script
  @encoding = encoding

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
end
39
+
40
# Looks a word up through the lookup object.
#
# @param word [String, nil] word to check
# @return [Hash] +:found+ (Boolean), +:stem+ (stem of the first accepted
#   form, or the word itself when that form carries no stem; nil when not
#   found) and +:flags+ (Array, empty when not found)
def check(word)
  return { found: false, stem: nil, flags: [] } if word.nil? || word.empty?

  form = @lookuper.good_forms(word).first
  return { found: false, stem: nil, flags: [] } unless form

  { found: true, stem: form.stem || word, flags: form.flags&.to_a || [] }
end
49
+
50
# Proposes replacements for a word the lookup object does not accept.
#
# @param word [String, nil] word to find suggestions for
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<Hash>] suggestion hashes as produced by
#   generate_suggestions; empty for nil/empty input or an accepted word
def suggest(word, max_suggestions: 10)
  return [] if word.nil? || word.empty?

  known_form = @lookuper.good_forms(word).first
  return [] if known_form

  candidates = generate_suggestions(word, max_suggestions)
  candidates.first(max_suggestions)
end
56
+
57
# Tells whether the word is accepted by the dictionary.
#
# @param word [String, nil] word to check
# @return [Boolean] the +:found+ entry of {#check}'s result
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
60
+
61
# Exposes the lookup object built in the constructor.
#
# @return [Object] the value stored in @lookuper
def lookuper
  @lookuper
end
64
+
65
+ private
66
+
67
# Levenshtein edit distance (insert, delete, substitute; each costs 1).
#
# Keeps only the previous and the current row of the distance table.
#
# @param a [String] first string
# @param b [String] second string
# @return [Integer] number of single-character edits turning +a+ into +b+
def calculate_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  previous_row = (0..b.length).to_a
  a.each_char.with_index(1) do |a_char, row|
    current_row = [row]
    b.each_char.with_index(1) do |b_char, col|
      substitution = previous_row[col - 1] + (a_char == b_char ? 0 : 1)
      deletion = previous_row[col] + 1
      insertion = current_row[col - 1] + 1
      current_row << [deletion, insertion, substitution].min
    end
    previous_row = current_row
  end
  previous_row.last
end
80
+
81
# Scores a suggestion between 0.0 and 1.0.
#
# The score is the edit-distance similarity (1 - distance / longer length)
# minus a penalty of 0.05 per rank position, floored at 0.0.
#
# @param original [String] the word as typed
# @param suggestion [String] the candidate replacement
# @param rank [Integer] position of the candidate; each step costs 0.05
# @return [Float] score, never negative
def calculate_score(original, suggestion, rank)
  rank_penalty = rank * 0.05
  longest = [original.length, suggestion.length].max
  # Two empty strings are identical; without this guard the division below
  # would be 0.0 / 0 and yield NaN.
  return [1.0 - rank_penalty, 0.0].max if longest.zero?

  similarity = 1.0 - (calculate_distance(original, suggestion).to_f / longest)
  [similarity - rank_penalty, 0.0].max
end
88
+
89
# Generates dictionary-accepted variations of a misspelled word.
#
# Three families of candidates are tried, in this order:
# 1. doubling a character (every position except the first),
# 2. deleting one character,
# 3. replacing a letter (or the digraph "ph") with a commonly confused one.
#
# Candidates are kept only when the lookup object accepts them, then ordered
# by edit distance; ties keep generation order so the result is deterministic.
#
# @param word [String] the misspelled word
# @param max_suggestions [Integer, nil] maximum number of results (nil = all)
# @return [Array<Hash>] hashes with +:word+, +:distance+ and +:score+
def generate_suggestions(word, max_suggestions)
  substitutions = {
    'a' => %w[e i o u],
    'e' => %w[a i o u],
    'i' => %w[a e o u],
    'o' => %w[a e i u],
    'u' => %w[a e i o],
    's' => %w[z c],
    'z' => %w[s],
    'c' => %w[k s],
    'k' => %w[c],
    'ph' => %w[f],
    'f' => %w[ph]
  }
  candidates = []

  # Doubled letters ("leter" -> "letter"); the first character is never doubled.
  (1...word.length).each { |i| candidates << word.dup.insert(i, word[i]) }

  # Single-character deletions.
  word.length.times do |i|
    shorter = word[0...i] + word[(i + 1)..-1]
    candidates << shorter unless shorter.empty?
  end

  # Substitutions. Keys are compared against a slice of their own length so
  # the two-character key "ph" can match (a per-character lookup never could).
  word.length.times do |i|
    substitutions.each do |from, replacements|
      next unless word[i, from.length].downcase == from

      head = word[0...i]
      tail = word[(i + from.length)..-1]
      replacements.each { |to| candidates << "#{head}#{to}#{tail}" }
    end
  end

  accepted = candidates.uniq.select { |candidate| @lookuper.good_forms(candidate).first }
  ranked = accepted
           .map { |s| { word: s, distance: calculate_distance(word, s), score: calculate_score(word, s, 0) } }
           .each_with_index
           .sort_by { |entry, index| [entry[:distance], index] }
           .map(&:first)
  max_suggestions ? ranked.first(max_suggestions) : ranked
end
129
+ end
130
+
131
+ # English tokenizer with contraction handling.
132
+ class Tokenizer < Components::WhitespaceTokenizer
133
# Contraction suffix => [expanded word, tag].
CONTRACTIONS = {
  "n't" => %w[not NEG],
  "'ll" => %w[will MD],
  "'ve" => %w[have VBP],
  "'re" => %w[are VBP],
  "'m" => %w[am VBP],
  "'d" => %w[would MD],
  "'s" => %w[is VBZ],
  "'clock" => %w[of IN]
}.freeze

# Irregular contractions and the pieces they are split into.
WONT_EXCEPTION = { "won't" => %w[will not] }.freeze
CANT_EXCEPTION = { "can't" => ['can', "'t"] }.freeze

# Letters followed by 's, where the 's is followed by a letter or a line end.
POSSESSIVE_PATTERN = /([A-Za-z]+)('s)(?=[A-Za-z]|$)/

# Bases for which a trailing 's is usually a contraction rather than a possessive.
CONTRACTION_WITH_S = %w[it he that what who there].freeze
147
+
148
# @param expand_contractions [Boolean] when true, {#tokenize} splits
#   contractions into separate tokens
def initialize(expand_contractions: true)
  # Explicit empty parentheses: the parent constructor gets no arguments.
  super()
  @expand_contractions = expand_contractions
end
152
+
153
# Tokenizes text with the parent tokenizer, then optionally splits
# contractions.
#
# @param text [String, nil] text to tokenize
# @return [Array] tokens from the parent tokenizer, with contractions split
#   when the tokenizer was built with expand_contractions: true; empty for
#   nil/empty text
def tokenize(text)
  return [] if text.nil? || text.empty?

  base_tokens = super
  @expand_contractions ? expand_contractions(base_tokens) : base_tokens
end
161
+
162
+ private
163
+
164
# Splits contraction tokens into their parts.
#
# "won't" and "can't" are handled explicitly; every other token is passed to
# expand_single_contraction and kept unchanged when that returns nil.
#
# Positions and lengths of the produced parts refer to the span of the
# original token, the same way expand_single_contraction splits a token into
# prefix + suffix. For "won't" that is "wo" (2 chars) + "n't" (3 chars);
# the parts previously claimed length 5 and an offset of +5, which pointed
# past the end of the 5-character source token.
#
# @param tokens [Array<Hash>] token hashes with +:token+, +:position+, +:length+
# @return [Array<Hash>] tokens with contractions split
def expand_contractions(tokens)
  tokens.flat_map do |token|
    start = token[:position]
    case token[:token]
    when "won't"
      [{ token: 'will', position: start, length: 2 },
       { token: 'not', position: start + 2, length: 3 }]
    when "can't"
      [{ token: 'can', position: start, length: 3 },
       { token: "'t", position: start + 3, length: 2 }]
    else
      expand_single_contraction(token) || [token]
    end
  end
end
191
+
192
# Splits one token into [base, suffix] when it ends in a known contraction
# suffix or in 's.
#
# The 's case requires the token to END in 's. POSSESSIVE_PATTERN alone also
# matches an 's in the middle of a word (e.g. "o'shea"), after which cutting
# the last two characters would produce a bogus base.
#
# @param token [Hash] token hash with +:token+ and +:position+
# @return [Array<Hash>, nil] two token hashes (base and suffix) whose
#   positions/lengths partition the original token, or nil when the token is
#   not a contraction
def expand_single_contraction(token)
  word = token[:token]

  suffix =
    if word.end_with?("'s") && word.match?(POSSESSIVE_PATTERN)
      "'s"
    else
      CONTRACTIONS.each_key.find do |candidate|
        # 's is handled above; 'clock is deliberately never split.
        next false if candidate == "'s" || candidate == "'clock"

        word.end_with?(candidate) && word.length > candidate.length
      end
    end
  return nil unless suffix

  base = word[0...-suffix.length]
  start = token[:position]
  [
    { token: base, position: start, length: base.length },
    { token: suffix, position: start + base.length, length: suffix.length }
  ]
end
211
+ end
212
+
213
+ # English POS tagger.
214
+ class POSTagger < Components::PosTagger
215
# Dictionary flag => part-of-speech label.
#
# Written as [label, flags] groups and expanded into a flat flag => label
# hash; group order is kept so the key order matches the flat listing.
FLAG_TO_POS = [
  ['NOUN', %w[N NN NNS NNP]],
  ['NOUN_PROPER', %w[NP]],
  ['VERB', %w[V VB VBD VBG VBN VBP VBZ]],
  ['VERB_MODAL', %w[MD]],
  ['ADJ', %w[A JJ JJR JJS]],
  ['ADV', %w[R RB RBR RBS]],
  ['DET', %w[D DT PDT]],
  ['PRON', %w[P PP PRP]],
  ['PRON_POSS', %w[PRP$]],
  ['PRON', %w[WP]],
  ['PRON_POSS', %w[WP$]],
  ['PREP', %w[I IN]],
  ['CONJ', %w[C CC]],
  ['PART', %w[U RP]],
  ['INTJ', %w[INTJ UH]],
  ['NUM', %w[CD]],
  ['X', %w[FW]],
  ['PUNCT', %w[PUNCT . , ! ? ; :]]
].each_with_object({}) do |(label, flags), table|
  flags.each { |flag| table[flag] = label }
end.freeze
232
+
233
+ attr_reader :aff_path, :dic_path, :script
234
+
235
# Builds a tagger that derives part-of-speech tags from dictionary flags.
#
# @param aff_path [String] path of the affix (.aff) file
# @param dic_path [String] path of the word list (.dic) file
# @param script [Symbol] script identifier forwarded to the lookup builder
# @param encoding [String] encoding forwarded to the lookup builder
# @param flag_mapping [Hash] flag => part-of-speech label table
def initialize(aff_path:, dic_path:, script: :latin, encoding: 'ISO-8859-1', flag_mapping: FLAG_TO_POS)
  @aff_path = aff_path
  @dic_path = dic_path
  @script = script
  @encoding = encoding
  @flag_mapping = flag_mapping

  builder = Readers::LookupBuilder.new(aff_path, dic_path, encoding: encoding, script: script)
  @lookuper = builder.build
  # Per-word memo of lookup results, filled by lookup_with_pos.
  @lookup_cache = {}
end
244
+
245
# Adds +:pos_tag+ and +:lemma+ to every token.
#
# @param tokens [Array<Hash>, nil] token hashes carrying a +:token+ string
# @return [Array<Hash>] copies of the tokens with +:pos_tag+ and +:lemma+
#   merged in; the lemma falls back to the word itself, and both are nil
#   for tokens whose +:token+ is nil or empty
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  tokens.map do |token|
    word = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if word.nil? || word.empty?

    found = lookup_with_pos(word)
    token.merge(pos_tag: found[:pos_tag], lemma: found[:lemma] || word)
  end
end
257
+
258
# @return [Hash] the flag => part-of-speech table currently in use
def flag_mapping
  @flag_mapping
end
261
+
262
# Replaces the flag => part-of-speech table.
#
# Cached lookups hold tags derived from the previous table, so the cache is
# emptied; otherwise already-seen words would keep their old tags.
#
# @param mapping [Hash] new flag => part-of-speech label table
def flag_mapping=(mapping)
  @lookup_cache&.clear
  @flag_mapping = mapping
end
265
+
266
# Empties the per-word lookup memo.
#
# @return [Hash] the (now empty) cache
def clear_cache
  @lookup_cache.clear
end
269
+
270
+ private
271
+
272
# Looks up a word and memoizes its tag and lemma.
#
# @param word [String, nil] word to look up
# @return [Hash] +:pos_tag+ and +:lemma+ (the stem of the first accepted
#   form); both nil for nil/empty input, which is not cached
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    form = @lookuper.good_forms(word).first
    @lookup_cache[word] = { pos_tag: derive_pos_tag(form), lemma: form&.stem }
  end
end
281
+
282
# Derives a part-of-speech label for a lookup result.
#
# The first flag that maps to a label wins; when no flag maps (or there are
# no flags) the label is guessed from the result's affix.
#
# @param result [Object, nil] lookup result responding to +flags+
# @return [String, nil] label, or nil when +result+ is nil or nothing applies
def derive_pos_tag(result)
  return nil unless result

  label = nil
  (result.flags&.to_a || []).find { |flag| label = flag_to_pos(flag) }
  label || guess_pos_from_affix(result)
end
292
+
293
# Maps one flag to a part-of-speech label.
#
# An exact key match wins; otherwise the flag's first element (+flag[0]+)
# is looked up.
#
# @param flag [String] dictionary flag
# @return [String, nil] label, or nil when neither lookup succeeds
def flag_to_pos(flag)
  @flag_mapping.fetch(flag) { @flag_mapping[flag[0]] }
end
298
+
299
# Guesses a part-of-speech label from the suffix applied to a lookup result.
#
# Only the suffix is considered. The former trailing prefix lookup returned
# nil on both of its paths, so it was dead code and has been removed.
#
# @param result [Object] lookup result responding to +suffix+; the suffix,
#   when present, is indexed with +:add+ and then +'add'+
# @return [String, nil] guessed label, or nil when there is no suffix text
def guess_pos_from_affix(result)
  suffix = result.suffix
  return nil unless suffix

  added_text = suffix[:add] || suffix['add']
  added_text ? guess_pos_from_suffix(added_text) : nil
end
310
+
311
# Guesses a part-of-speech label from suffix text.
#
# Verb, noun and adjective suffixes must equal the whole text. The previous
# regexes used the line anchors ^ and $, which also match around an embedded
# newline. The adverb rule still matches any text ending in "ly".
#
# @param suffix [Object] suffix text; anything but a String yields nil
# @return [String, nil] 'VERB', 'ADV', 'NOUN', 'ADJ' or nil
def guess_pos_from_suffix(suffix)
  return nil unless suffix.is_a?(String)
  return 'VERB' if %w[ing ed es s].include?(suffix)
  return 'ADV' if suffix.end_with?('ly')
  return 'NOUN' if %w[tion sion ment ness ity ship er or ist].include?(suffix)
  return 'ADJ' if %w[able ible al ial ic ive ful less ous].include?(suffix)

  nil
end
319
+ end
320
+
321
+ # Registration and configuration
322
# Register this class for the generic code and each regional variant.
["en", "en-US", "en-GB", "en-AU", "en-CA", "en-NZ", "en-ZA"].each do |language_code|
  register language_code
end
329
+
330
# Hunspell affix/dictionary files per language code.
#
# The inner hashes are frozen as well, so the shared path table cannot be
# modified through a reference handed out by resolve_hunspell_paths.
#
# NOTE(review): the paths are relative and point into spec fixtures, so they
# presumably only resolve when the process runs from the project root, and
# every code other than en-US falls back to the en-US files. Confirm intent.
HUNSPELL_DICTIONARIES = {
  'en-US' => {
    aff: 'spec/integrational/fixtures/en_US.aff',
    dic: 'spec/integrational/fixtures/en_US.dic'
  }.freeze
}.freeze

# Region code => adjective used in the human-readable description.
VARIANT_NAMES = {
  'US' => 'American',
  'GB' => 'British',
  'CA' => 'Canadian',
  'AU' => 'Australian',
  'NZ' => 'New Zealand',
  'ZA' => 'South African'
}.freeze
345
+
346
# @param code [String] language code, e.g. "en" or "en-GB"
# @param name [String] display name of the language
# @param variant [String, nil] region code; taken from +code+ when omitted
def initialize(code: "en", name: "English", variant: nil)
  region = variant || extract_region_code(code)
  super(code: code, name: name, variant: region)
  @hunspell_paths = resolve_hunspell_paths(code)
end
351
+
352
# Human-readable name, e.g. "English (British)".
#
# @return [String] the plain name when there is no variant; otherwise the
#   name followed by the variant's adjective (or the raw variant when it
#   has no entry in VARIANT_NAMES) in parentheses
def description
  return name unless variant

  "#{name} (#{VARIANT_NAMES.fetch(variant, variant)})"
end
357
+
358
# @return [Tokenizer] a tokenizer created on first use and reused afterwards
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
361
+
362
# @return [Language::Normalizer::Base] a normalizer created on first use and
#   reused afterwards
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
365
+
366
# @return [Class] the dictionary class returned for this language
#   (Dictionary::UnixWords)
def dictionary_class
  Dictionary::UnixWords
end
369
+
370
# Word-list location for the language code.
#
# @return [Array<String>] a single path: the British list for en-GB/AU/NZ/ZA,
#   the American list for en-US/CA, the generic list otherwise
def default_dictionary_paths
  wordlist =
    if %w[en-GB en-AU en-NZ en-ZA].include?(code)
      "/usr/share/dict/british-english"
    elsif %w[en-US en-CA].include?(code)
      "/usr/share/dict/american-english"
    else
      "/usr/share/dict/words"
    end
  [wordlist]
end
380
+
381
# @return [Symbol] always +:latin+
def script_type
  :latin
end
384
+
385
# Builds a new spell checker from the dictionary paths resolved in the
# constructor.
#
# @return [SpellChecker] a fresh checker on every call
def create_spell_checker
  paths = @hunspell_paths
  SpellChecker.new(aff_path: paths[:aff], dic_path: paths[:dic], script: :latin)
end
392
+
393
# @return [Tokenizer] a new tokenizer on every call (unlike {#tokenizer},
#   which memoizes)
def create_tokenizer
  Tokenizer.new
end
396
+
397
# Builds a new part-of-speech tagger from the dictionary paths resolved in
# the constructor and the English flag mapping.
#
# @return [POSTagger] a fresh tagger on every call
def create_pos_tagger
  paths = @hunspell_paths
  POSTagger.new(
    aff_path: paths[:aff],
    dic_path: paths[:dic],
    script: :latin,
    flag_mapping: english_pos_flag_mapping
  )
end
405
+
406
# @return [Grammar::RuleEngine] a new rule engine constructed with
#   language: 'en'
def create_grammar_rules
  Grammar::RuleEngine.new(language: 'en')
end
409
+
410
# Checks whether a word is accepted by the dictionary of another variant.
#
# Checkers for the other variants are built once and reused: constructing a
# SpellChecker loads its dictionary files, which previously happened on
# every call.
#
# @param word [String] word to test
# @return [Hash, nil] +{ variant:, code: }+ of the first other variant that
#   accepts the word; nil when none does, when this language has no variant,
#   or when its code is the generic 'en'
def valid_in_other_variant?(word)
  return nil if @variant.nil? || @code == 'en'

  HUNSPELL_DICTIONARIES.each do |variant_code, paths|
    next if variant_code == @code
    next unless File.exist?(paths[:aff]) && File.exist?(paths[:dic])

    @variant_checkers ||= {}
    checker = (@variant_checkers[variant_code] ||= SpellChecker.new(
      aff_path: paths[:aff], dic_path: paths[:dic], script: :latin, encoding: 'ISO-8859-1'
    ))
    next unless checker.correct?(word)

    region = variant_code.split('-').last.upcase
    return { variant: VARIANT_NAMES[region] || variant_code, code: "en-#{region}" }
  end
  nil
end
424
+
425
+ private
426
+
427
# Extracts the upper-cased part after the first hyphen of a language code.
#
# @param code [String] language code such as "en-GB"
# @return [String, nil] e.g. "GB"; nil when the code has no hyphen
def extract_region_code(code)
  _language, separator, region = code.partition("-")
  separator.empty? ? nil : region.upcase
end
431
+
432
# Picks the Hunspell file paths for a language code.
#
# @param code [String] language code
# @return [Hash] the entry for +code+, or the 'en-US' entry when the code
#   has none
def resolve_hunspell_paths(code)
  HUNSPELL_DICTIONARIES.fetch(code) { HUNSPELL_DICTIONARIES['en-US'] }
end
435
+
436
# Flag => part-of-speech table handed to the English tagger.
#
# @return [Hash] a new, unfrozen hash: POSTagger::FLAG_TO_POS merged with the
#   entries listed below (the shared constant itself is left untouched)
def english_pos_flag_mapping
  POSTagger::FLAG_TO_POS.merge(
    'VBD' => 'VERB', 'VBG' => 'VERB', 'VBN' => 'VERB',
    'VBP' => 'VERB', 'VBZ' => 'VERB',
    'DT' => 'DET', 'WDT' => 'DET',
    'PRP' => 'PRON', 'PRP$' => 'PRON_POSS',
    'WP' => 'PRON', 'WP$' => 'PRON_POSS'
  )
end
446
+ end
447
+ end
448
+ end