kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,378 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Language
    # Language identification using FastText LID model.
    #
    # Identifies the language of text using FastText's pretrained
    # language identification model (lid.176.ftz). Prediction is delegated
    # to an external process: the `fasttext` command-line tool when it is on
    # the PATH, otherwise the `fasttext` Python package run through `python3`.
    #
    # @example Detect the most likely language
    #   lid = LanguageIdentifier.new
    #   result = lid.detect_primary("Hello world")
    #   result.language   # => "en"
    #   result.confidence # => 0.95
    #
    # @example Detect from file
    #   results = lid.detect_from_file("document.txt", top_k: 3)
    #   results.map(&:language) # => ["en", "de", "fr"]
    class LanguageIdentifier
      # FastText LID model URL
      MODEL_URL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'

      # Upper bound on HTTP redirects followed while downloading the model.
      MAX_REDIRECTS = 5

      # Maximum number of characters of input handed to the model.
      MAX_TEXT_LENGTH = 500

      # Prefix FastText puts in front of every predicted label.
      LABEL_PREFIX = '__label__'

      # Python program used by the Python backend. Model path, input path and
      # k arrive through sys.argv so that no caller-controlled value is ever
      # spliced into the program text.
      PYTHON_PREDICT_SCRIPT = <<~'PYTHON'.freeze
        import sys
        import fasttext

        model = fasttext.load_model(sys.argv[1])
        with open(sys.argv[2], 'r', encoding='utf-8') as f:
            text = f.read().strip()
        labels, probs = model.predict(text, k=int(sys.argv[3]))
        for label, prob in zip(labels, probs):
            print(f"{label} {prob}")
      PYTHON

      # Language code mapping (FastText LID → ISO 639-1).
      # FastText emits labels such as "__label__en"; the keys here are the
      # label with that prefix removed.
      LANGUAGE_MAPPING = {
        'en' => 'en',
        'de' => 'de',
        'es' => 'es',
        'fr' => 'fr',
        'pt' => 'pt',
        'ru' => 'ru',
        'it' => 'it',
        'nl' => 'nl',
        'pl' => 'pl',
        'sv' => 'sv',
        'da' => 'da',
        'no' => 'no',
        'fi' => 'fi',
        'cs' => 'cs',
        'el' => 'el',
        'hu' => 'hu',
        'ro' => 'ro',
        'bg' => 'bg',
        'sk' => 'sk',
        'sl' => 'sl',
        'hr' => 'hr',
        'sr' => 'sr',
        'et' => 'et',
        'lv' => 'lv',
        'lt' => 'lt',
        'mt' => 'mt',
        'ga' => 'ga',
        'cy' => 'cy',
        'tr' => 'tr',
        'ar' => 'ar',
        'he' => 'he',
        'fa' => 'fa',
        'ur' => 'ur',
        'hi' => 'hi',
        'bn' => 'bn',
        'ta' => 'ta',
        'te' => 'te',
        'ml' => 'ml',
        'kn' => 'kn',
        'th' => 'th',
        'vi' => 'vi',
        'id' => 'id',
        'ms' => 'ms',
        'sw' => 'sw',
        'zh' => 'zh',
        'ja' => 'ja',
        'ko' => 'ko'
      }.freeze

      # Value object for detection result.
      #
      # @attr_reader [String] language ISO 639-1 language code
      # @attr_reader [Float] confidence Confidence score (0.0 to 1.0)
      # @attr_reader [String] label FastText label with the "__label__" prefix removed
      DetectionResult = Struct.new(:language, :confidence, :label, keyword_init: true) do
        def to_s
          "#{language} (#{(confidence * 100).round(1)}%)"
        end
      end

      attr_reader :model_path, :loaded

      # Create a new language identifier.
      #
      # @param model_path [String, nil] Path to lid.176.ftz model; the default
      #   cache location is used when nil
      # @param auto_download [Boolean] Download model if not found
      def initialize(model_path: nil, auto_download: true)
        @model_path = model_path || default_model_path
        @auto_download = auto_download
        @loaded = false
        @backend = nil
      end

      # Detect language of text.
      #
      # @param text [String] Text to analyze
      # @param top_k [Integer] Number of top results to return
      # @return [Array<DetectionResult>] Detection results sorted by confidence
      #   (highest first); empty when the text is blank
      # @raise [ArgumentError] If top_k is not a positive integer
      # @raise [RuntimeError] If the model or a FastText backend is unavailable,
      #   or the backend process fails
      def detect(text, top_k: 1)
        limit = Integer(top_k)
        raise ArgumentError, "top_k must be a positive integer (got #{top_k.inspect})" unless limit.positive?

        ensure_model_loaded

        prepared = preprocess_text(text)
        # Nothing to classify; skip spawning a process for blank input.
        return [] if prepared.empty?

        run_detection(prepared, limit)
      end

      # Detect language from file.
      #
      # @param filepath [String] Path to file
      # @param top_k [Integer] Number of top results
      # @return [Array<DetectionResult>] Detection results
      def detect_from_file(filepath, top_k: 1)
        text = File.read(filepath, encoding: 'UTF-8')
        detect(text, top_k: top_k)
      end

      # Get the most likely language.
      #
      # @param text [String] Text to analyze
      # @return [DetectionResult, nil] Top detection result, nil when nothing was detected
      def detect_primary(text)
        detect(text, top_k: 1).first
      end

      # Check if model is downloaded.
      #
      # @return [Boolean] True if model file exists
      def model_downloaded?
        File.exist?(@model_path)
      end

      # Download the FastText LID model.
      #
      # The body is streamed into a sibling "*.part" file and renamed into
      # place only after the transfer finished, so an interrupted download
      # never leaves a truncated file at the model path.
      #
      # @return [String] Path to downloaded model
      # @raise [RuntimeError] On a non-success HTTP response or too many redirects
      def download_model
        require 'net/http'
        require 'uri'
        require 'fileutils'

        FileUtils.mkdir_p(File.dirname(@model_path))

        puts "Downloading language identification model..."
        puts "  From: #{MODEL_URL}"
        puts "  To: #{@model_path}"

        partial_path = "#{@model_path}.part"
        begin
          fetch_to_file(MODEL_URL, partial_path)
          File.rename(partial_path, @model_path)
        ensure
          # No-op after a successful rename; removes leftovers after a failure.
          FileUtils.rm_f(partial_path)
        end

        puts "  ✓ Download complete"

        @model_path
      end

      # Get supported languages.
      #
      # @return [Array<String>] List of supported ISO 639-1 codes
      def self.supported_languages
        LANGUAGE_MAPPING.keys
      end

      private

      # Get default model path.
      #
      # @return [String] Default path for lid.176.ftz
      def default_model_path
        File.join(Kotoshu::Paths.cache_path, 'models', 'lid.176.ftz')
      end

      # Ensure the model file exists (downloading it when allowed) and that a
      # prediction backend has been selected.
      #
      # @raise [RuntimeError] If the model file is missing or no backend exists
      def ensure_model_loaded
        download_model if @auto_download && !model_downloaded?

        raise "Model not found: #{@model_path}" unless model_downloaded?

        load_model unless @loaded
      end

      # Select the prediction backend once and remember it, so later calls do
      # not have to probe the system again.
      #
      # @raise [RuntimeError] If neither backend is available
      def load_model
        @backend =
          if fasttext_available?
            :cli
          elsif python_fasttext_available?
            :python
          end

        raise "FastText not available. Install fasttext CLI or Python library" unless @backend

        @loaded = true
      end

      # Check if fasttext CLI is available.
      #
      # @return [Boolean] True if fasttext command exists
      def fasttext_available?
        # Kernel#system yields nil when the probe itself cannot be executed.
        system('which', 'fasttext', out: File::NULL, err: File::NULL) ? true : false
      end

      # Check if Python fasttext library is available.
      #
      # @return [Boolean] True if fasttext Python package is installed
      def python_fasttext_available?
        system('python3', '-c', 'import fasttext', out: File::NULL, err: File::NULL) ? true : false
      end

      # Preprocess text for detection.
      #
      # Whitespace (including newlines) is collapsed to single spaces first so
      # the character budget is not spent on padding, then the text is cut to
      # at most MAX_TEXT_LENGTH characters.
      #
      # @param text [String] Raw text
      # @return [String] Preprocessed single-line text
      def preprocess_text(text)
        collapsed = text.to_s.strip.gsub(/\s+/, ' ')
        collapsed[0, MAX_TEXT_LENGTH]
      end

      # Run language detection with the backend chosen by #load_model.
      #
      # @param text [String] Preprocessed text
      # @param top_k [Integer] Number of results
      # @return [Array<DetectionResult>] Detection results
      def run_detection(text, top_k)
        require 'tempfile'

        Tempfile.create('lid_input_', encoding: 'UTF-8') do |file|
          file.write(text)
          file.flush

          if @backend == :cli
            run_fasttext_cli(file.path, top_k)
          else
            run_python_fasttext(file.path, top_k)
          end
        end
      end

      # Run detection using fasttext CLI.
      #
      # Uses the "predict-prob" sub-command: plain "predict" prints labels
      # without probabilities, which leaves nothing to build a confidence from.
      #
      # @param input_file [String] Path to input file
      # @param top_k [Integer] Number of results
      # @return [Array<DetectionResult>] Detection results
      # @raise [RuntimeError] If the fasttext process exits unsuccessfully
      def run_fasttext_cli(input_file, top_k)
        require 'open3'

        stdout, stderr, status =
          Open3.capture3('fasttext', 'predict-prob', @model_path, input_file, top_k.to_s)
        raise "FastText prediction failed: #{stderr.strip}" unless status.success?

        parse_fasttext_output(stdout)
      end

      # Run detection using Python fasttext.
      #
      # @param input_file [String] Path to input file
      # @param top_k [Integer] Number of results
      # @return [Array<DetectionResult>] Detection results
      # @raise [RuntimeError] If the python3 process exits unsuccessfully
      def run_python_fasttext(input_file, top_k)
        require 'open3'

        stdout, stderr, status =
          Open3.capture3('python3', '-c', PYTHON_PREDICT_SCRIPT, @model_path, input_file, top_k.to_s)
        raise "FastText prediction failed: #{stderr.strip}" unless status.success?

        parse_fasttext_output(stdout)
      end

      # Parse FastText output.
      #
      # Accepts one "label probability" pair per line as well as several pairs
      # on a single line ("__label__en 0.9 __label__de 0.05"). Pairs whose
      # probability is not numeric are skipped.
      #
      # @param output [String] Raw output from fasttext
      # @return [Array<DetectionResult>] Parsed results, highest confidence first
      def parse_fasttext_output(output)
        results = []

        output.to_s.each_line do |line|
          line.split.each_slice(2) do |raw_label, raw_probability|
            next unless raw_probability

            probability = Float(raw_probability, exception: false)
            next unless probability&.finite?

            label = raw_label.sub(LABEL_PREFIX, '')
            next if label.empty?

            results << DetectionResult.new(
              language: LANGUAGE_MAPPING[label] || label,
              # Rounding inside the model can push a score marginally past 1.0.
              confidence: probability.clamp(0.0, 1.0),
              label: label
            )
          end
        end

        results.sort_by { |result| -result.confidence }
      end

      # Download url into destination, following at most MAX_REDIRECTS redirects.
      #
      # @param url [String] URL to fetch
      # @param destination [String] File the response body is written to
      # @raise [RuntimeError] On a non-success response or too many redirects
      def fetch_to_file(url, destination)
        current_url = url

        (MAX_REDIRECTS + 1).times do
          redirect_target = request_once(current_url, destination)
          return unless redirect_target

          current_url = redirect_target
        end

        raise "Failed to download model: too many redirects"
      end

      # Perform a single GET request, streaming a successful body to destination.
      #
      # @param url [String] URL to fetch
      # @param destination [String] File the response body is written to
      # @return [String, nil] Absolute redirect target, or nil once the body was saved
      # @raise [RuntimeError] On a non-success, non-redirect response
      def request_once(url, destination)
        uri = URI.parse(url)
        redirect_target = nil

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          http.request(Net::HTTP::Get.new(uri.request_uri)) do |response|
            case response
            when Net::HTTPSuccess
              File.open(destination, 'wb') do |file|
                response.read_body { |chunk| file.write(chunk) }
              end
            when Net::HTTPRedirection
              location = response['location']
              raise "Failed to download model: redirect without Location header" unless location

              # Location may be relative; resolve it against the current URL.
              redirect_target = URI.join(url, location).to_s
            else
              raise "Failed to download model: #{response.code} #{response.message}"
            end
          end
        end

        redirect_target
      end
    end
  end
end
@@ -0,0 +1,256 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Language
    # Abstract superclass shared by every language implementation.
    #
    # Subclasses supply the language-specific collaborators (tokenizer,
    # normalizer, dictionary class); this class builds the common behaviour
    # on top of them and provides helpers for working with language codes.
    #
    # A concrete language is expected to:
    # 1. Inherit from this class
    # 2. Implement #tokenizer, #normalizer and #dictionary_class
    # 3. Register itself with Language::Registry
    #
    # @example Implement a language
    #   class English < Kotoshu::Language::Base
    #     register "en"
    #
    #     def initialize
    #       super(code: "en", name: "English")
    #     end
    #
    #     def tokenizer
    #       @tokenizer ||= Tokenizer::LatinTokenizer.new
    #     end
    #
    #     def normalizer
    #       @normalizer ||= Normalizer::Base.new
    #     end
    #
    #     def dictionary_class
    #       Dictionary::UnixWords
    #     end
    #   end
    class Base
      attr_reader :code, :name, :variant, :region

      class << self
        # Register the receiving class with the language registry.
        #
        # @param code [String] Language code
        # @return [void]
        def register(code)
          Kotoshu::Language::Registry.register(code, self)
        end

        # Return the memoized per-class instance, building it on first use.
        #
        # @return [Base] Language instance
        def instance
          return @instance if @instance

          @instance = new
        end
      end

      # Build a language description.
      #
      # @param code [String] Language code (e.g., "en", "en-US", "de-DE")
      # @param name [String] Human-readable name
      # @param variant [String, nil] Variant name (e.g., "American", "British")
      def initialize(code:, name:, variant: nil)
        @code, @name, @variant = code, name, variant
        @region = extract_region(code)
      end

      # Tokenizer for this language. Must be provided by subclasses.
      #
      # @return [Tokenizer::Base] Language-specific tokenizer
      # @raise [NotImplementedError] If not implemented
      def tokenizer
        abstract_method!(:tokenizer)
      end

      # Normalizer for this language. Must be provided by subclasses.
      #
      # @return [Normalizer::Base] Language-specific normalizer
      # @raise [NotImplementedError] If not implemented
      def normalizer
        abstract_method!(:normalizer)
      end

      # Dictionary backend class for this language. Must be provided by subclasses.
      #
      # @return [Class] Dictionary backend class
      # @raise [NotImplementedError] If not implemented
      def dictionary_class
        abstract_method!(:dictionary_class)
      end

      # Dictionary paths searched by default; empty unless a subclass overrides it.
      #
      # @return [Array<String>] List of dictionary paths to search
      def default_dictionary_paths
        []
      end

      # Character encoding name; UTF-8 unless a subclass overrides it.
      #
      # @return [String] Character encoding name
      def encoding
        "UTF-8"
      end

      # Whether the script runs right-to-left; false unless a subclass overrides it.
      #
      # @return [Boolean] True if RTL
      def rtl?
        false
      end

      # Script family of the language; :latin unless a subclass overrides it.
      # Possible values: :latin, :cyrillic, :arabic, :cjk, :mixed
      #
      # @return [Symbol] Script type
      def script_type
        :latin
      end

      # Split text into tokens via the language's tokenizer.
      #
      # @param text [String] Text to tokenize
      # @return [Array<String>] Array of tokens
      def tokenize(text)
        tokenizer.tokenize(text)
      end

      # Normalize text via the language's normalizer.
      #
      # @param text [String] Text to normalize
      # @param options [Hash] Normalization options
      # @return [String] Normalized text
      def normalize(text, options = {})
        normalizer.normalize(text, options)
      end

      # Look a word up in the given dictionary after normalizing it.
      #
      # @param word [String] Word to check
      # @param dictionary [Dictionary::Base] Dictionary to use
      # @return [Boolean] Result of the dictionary lookup
      def valid_word?(word, dictionary:)
        dictionary.lookup(normalize_word(word))
      end

      # Normalize a single word via the language's normalizer.
      #
      # @param word [String] Word to normalize
      # @return [String] Normalized word
      def normalize_word(word)
        normalizer.normalize_word(word)
      end

      # Summarize the language as a hash.
      #
      # @return [Hash] Language information
      def info
        summary = {}
        summary[:code] = code
        summary[:name] = name
        summary[:variant] = variant
        summary[:region] = region
        summary[:encoding] = encoding
        summary[:rtl?] = rtl?
        summary[:script_type] = script_type
        summary[:dictionary_class] = dictionary_class.name
        summary
      end

      # Check whether this language matches the given code, either exactly or
      # through the part before the first hyphen (e.g., "en" matches "en-US").
      #
      # @param other_code [String] Code to compare
      # @return [Boolean] True if matches
      def matches_code?(other_code)
        return false if other_code.nil?
        return true if code == other_code

        own_base = code.split("-").first
        other_base = other_code.split("-").first
        own_base == other_base
      end

      # Name of the language, followed by the variant in parentheses when one is set.
      #
      # @return [String] Full name
      def full_name
        variant ? "#{name} (#{variant})" : name
      end

      # Whether the code has no hyphenated suffix.
      #
      # @return [Boolean] True if base language
      def base_language?
        code.index("-").nil?
      end

      # Part of the code before the first hyphen.
      #
      # @return [String] Base language code (e.g., "en" from "en-US")
      def base_code
        head, = code.split("-")
        head
      end

      # Part of the code after the first hyphen, exactly as written in the code.
      #
      # @return [String, nil] Region code or nil
      def region_code
        _head, separator, tail = code.partition("-")
        separator.empty? ? nil : tail
      end

      # Whether another language shares this language's base code.
      #
      # @param other [Base] Other language
      # @return [Boolean] True if compatible
      def compatible_with?(other)
        other.is_a?(Base) && base_code == other.base_code
      end

      private

      # Raise the standard "must implement" error for a template method.
      #
      # @param method_name [Symbol] Name of the missing method
      # @raise [NotImplementedError] Always
      def abstract_method!(method_name)
        raise NotImplementedError, "#{self.class} must implement ##{method_name}"
      end

      # Upper-cased part of the code after the first hyphen.
      #
      # @param code [String] Language code
      # @return [String, nil] Region or nil
      def extract_region(code)
        _head, separator, tail = code.partition("-")
        separator.empty? ? nil : tail.upcase
      end
    end
  end
end