kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
# Language identification using FastText LID model.
|
|
6
|
+
#
|
|
7
|
+
# Identifies the language of text using FastText's pretrained
|
|
8
|
+
# language identification model (lid.176.ftz).
|
|
9
|
+
#
|
|
10
|
+
# @example Detect language
|
|
11
|
+
# lid = LanguageIdentifier.new
|
|
12
|
+
# result = lid.detect_primary("Hello world")
|
|
13
|
+
# result.language # => "en"
|
|
14
|
+
# result.confidence # => 0.95
|
|
15
|
+
#
|
|
16
|
+
# @example Detect from file
|
|
17
|
+
# results = lid.detect_from_file("document.txt", top_k: 3)
|
|
18
|
+
# results.map(&:language) # => ["en", "de", "fr"]
|
|
19
|
+
class LanguageIdentifier
  # FastText LID model URL
  MODEL_URL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'

  # Language code mapping (FastText LID label => ISO 639-1 code).
  #
  # FastText emits labels such as "__label__en"; once the prefix is removed
  # the remaining label is looked up here. Every entry currently maps to
  # itself, so the table doubles as the list of supported languages.
  LANGUAGE_MAPPING = %w[
    en de es fr pt ru it nl pl sv da no fi cs el hu
    ro bg sk sl hr sr et lv lt mt ga cy tr ar he fa
    ur hi bn ta te ml kn th vi id ms sw zh ja ko
  ].map { |code| [code.freeze, code] }.to_h.freeze

  # Prefix FastText puts in front of every predicted label.
  LABEL_PREFIX = '__label__'

  # Maximum number of characters handed to the model per detection.
  MAX_INPUT_CHARS = 500

  # Maximum number of HTTP redirects followed while downloading the model.
  MAX_REDIRECTS = 5

  # Python helper used when only the fasttext Python package is available.
  # Paths and k are passed through argv (never interpolated into the source)
  # so quotes or backslashes in a path cannot break or alter the script.
  PYTHON_PREDICT_SCRIPT = <<~PYTHON
    import sys
    import fasttext

    model = fasttext.load_model(sys.argv[1])
    with open(sys.argv[2], 'r', encoding='utf-8') as f:
        text = f.read().strip()
    labels, probs = model.predict(text, k=int(sys.argv[3]))
    for label, prob in zip(labels, probs):
        print(f"{label} {prob}")
  PYTHON

  private_constant :LABEL_PREFIX, :MAX_INPUT_CHARS, :MAX_REDIRECTS, :PYTHON_PREDICT_SCRIPT

  # Value object for detection result.
  #
  # @attr_reader [String] language ISO 639-1 language code
  # @attr_reader [Float] confidence Confidence score (0.0 to 1.0)
  # @attr_reader [String] label Raw FastText label (prefix removed)
  DetectionResult = Struct.new(:language, :confidence, :label, keyword_init: true) do
    # @return [String] e.g. "en (95.0%)"
    def to_s
      "#{language} (#{(confidence * 100).round(1)}%)"
    end
  end

  attr_reader :model_path, :loaded

  # Create a new language identifier.
  #
  # @param model_path [String] Path to lid.176.ftz model
  # @param auto_download [Boolean] Download model if not found
  def initialize(model_path: nil, auto_download: true)
    @model_path = model_path || default_model_path
    @auto_download = auto_download
    @loaded = false
    @backend = nil
  end

  # Detect language of text.
  #
  # @param text [String] Text to analyze
  # @param top_k [Integer] Number of top results to return
  # @return [Array<DetectionResult>] Detection results sorted by confidence
  # @raise [RuntimeError] If the model file or a FastText backend is missing
  def detect(text, top_k: 1)
    ensure_model_loaded
    run_detection(preprocess_text(text), top_k)
  end

  # Detect language from file.
  #
  # @param filepath [String] Path to file
  # @param top_k [Integer] Number of top results
  # @return [Array<DetectionResult>] Detection results
  def detect_from_file(filepath, top_k: 1)
    detect(File.read(filepath, encoding: 'UTF-8'), top_k: top_k)
  end

  # Get the most likely language.
  #
  # @param text [String] Text to analyze
  # @return [DetectionResult, nil] Top detection result
  def detect_primary(text)
    detect(text, top_k: 1).first
  end

  # Check if model is downloaded.
  #
  # @return [Boolean] True if model file exists
  def model_downloaded?
    File.exist?(@model_path)
  end

  # Download the FastText LID model.
  #
  # The body is streamed to a temporary "<model_path>.part" file and renamed
  # only after the transfer finished, so an interrupted download never
  # leaves a truncated file that {#model_downloaded?} would accept.
  #
  # @return [String] Path to downloaded model
  # @raise [RuntimeError] On a non-success HTTP response or too many redirects
  def download_model
    require 'fileutils'

    FileUtils.mkdir_p(File.dirname(@model_path))

    puts "Downloading language identification model..."
    puts " From: #{MODEL_URL}"
    puts " To: #{@model_path}"

    fetch_model(MODEL_URL, MAX_REDIRECTS)
    puts " ✓ Download complete"

    @model_path
  end

  # Get supported languages.
  #
  # @return [Array<String>] List of supported ISO 639-1 codes
  def self.supported_languages
    LANGUAGE_MAPPING.keys
  end

  private

  # Get default model path.
  #
  # @return [String] Default path for lid.176.ftz
  def default_model_path
    File.join(Kotoshu::Paths.cache_path, 'models', 'lid.176.ftz')
  end

  # Ensure the model file exists (downloading it when allowed) and that a
  # FastText backend has been selected.
  #
  # @raise [RuntimeError] If the model file is missing or no backend exists
  def ensure_model_loaded
    download_model if @auto_download && !model_downloaded?

    raise "Model not found: #{@model_path}" unless model_downloaded?

    load_model unless @loaded
  end

  # Select the FastText backend and mark the identifier as loaded.
  #
  # The chosen backend is remembered so later detections do not have to
  # spawn the availability probes again.
  #
  # @raise [RuntimeError] If neither the CLI nor the Python package is usable
  def load_model
    @backend = detect_backend
    raise "FastText not available. Install fasttext CLI or Python library" unless @backend

    @loaded = true
  end

  # Probe for a usable backend, preferring the CLI.
  #
  # @return [Symbol, nil] :cli, :python, or nil when neither is available
  def detect_backend
    if fasttext_available?
      :cli
    elsif python_fasttext_available?
      :python
    end
  end

  # Check if fasttext CLI is available.
  #
  # @return [Boolean] True if fasttext command exists
  def fasttext_available?
    command_succeeds?('which', 'fasttext')
  end

  # Check if Python fasttext library is available.
  #
  # @return [Boolean] True if fasttext Python package is installed
  def python_fasttext_available?
    command_succeeds?('python3', '-c', 'import fasttext')
  end

  # Run a command silently and report whether it exited successfully.
  #
  # Kernel#system returns nil when the executable itself cannot be started;
  # that is folded into false so callers always get a real Boolean.
  #
  # @param command [Array<String>] Executable followed by its arguments
  # @return [Boolean] True only if the command ran and exited with status 0
  def command_succeeds?(*command)
    system(*command, out: File::NULL, err: File::NULL) || false
  end

  # Preprocess text for detection.
  #
  # Tolerates nil and invalid byte sequences, trims surrounding whitespace,
  # keeps at most MAX_INPUT_CHARS characters and collapses every whitespace
  # run (including newlines) to a single space so the input is one line.
  #
  # @param text [String, nil] Raw text
  # @return [String] Preprocessed text
  def preprocess_text(text)
    text.to_s.scrub.strip[0, MAX_INPUT_CHARS].gsub(/\s+/, ' ')
  end

  # Run language detection with the selected backend.
  #
  # @param text [String] Preprocessed text
  # @param top_k [Integer] Number of results
  # @return [Array<DetectionResult>] Detection results
  # @raise [RuntimeError] If no FastText backend is available
  def run_detection(text, top_k)
    require 'tempfile'

    @backend ||= detect_backend
    raise "FastText not available. Install fasttext CLI or Python library" unless @backend

    # Both backends read their input from a file, so stage the text on disk.
    Tempfile.create('lid_input_', encoding: 'UTF-8') do |f|
      f.write(text)
      f.flush

      if @backend == :cli
        run_fasttext_cli(f.path, top_k)
      else
        run_python_fasttext(f.path, top_k)
      end
    end
  end

  # Run detection using fasttext CLI.
  #
  # Uses "predict-prob" (not "predict") because only that subcommand prints
  # the probability next to each label, which the parser needs.
  #
  # @param input_file [String] Path to input file
  # @param top_k [Integer] Number of results
  # @return [Array<DetectionResult>] Detection results
  def run_fasttext_cli(input_file, top_k)
    require 'open3'

    output, = Open3.capture3('fasttext', 'predict-prob', @model_path, input_file, top_k.to_s)
    parse_fasttext_output(output)
  end

  # Run detection using Python fasttext.
  #
  # @param input_file [String] Path to input file
  # @param top_k [Integer] Number of results
  # @return [Array<DetectionResult>] Detection results
  def run_python_fasttext(input_file, top_k)
    require 'open3'

    output, = Open3.capture3('python3', '-c', PYTHON_PREDICT_SCRIPT,
                             @model_path, input_file, top_k.to_s)
    parse_fasttext_output(output)
  end

  # Parse FastText output.
  #
  # Accepts one "label probability" pair per line (Python helper) as well as
  # several pairs on the same line (CLI with k > 1). Pairs without a numeric
  # probability are skipped.
  #
  # @param output [String] Raw output from fasttext
  # @return [Array<DetectionResult>] Parsed results, highest confidence first
  def parse_fasttext_output(output)
    results = output.to_s.each_line.flat_map do |line|
      line.split.each_slice(2).map { |label, probability| build_result(label, probability) }
    end

    results.compact.sort_by { |result| -result.confidence }
  end

  # Build one result from a raw label/probability token pair.
  #
  # @param raw_label [String] Label token, e.g. "__label__en"
  # @param raw_probability [String, nil] Probability token, e.g. "0.95"
  # @return [DetectionResult, nil] nil when the probability is not numeric
  def build_result(raw_label, raw_probability)
    confidence = parse_confidence(raw_probability)
    return nil unless confidence

    label = raw_label.delete_prefix(LABEL_PREFIX)
    DetectionResult.new(
      language: LANGUAGE_MAPPING.fetch(label, label), # unknown labels pass through
      confidence: confidence,
      label: label
    )
  end

  # Strictly convert a probability token to a Float.
  #
  # @param token [String, nil] Candidate number
  # @return [Float, nil] nil when the token is missing or not a number
  def parse_confidence(token)
    Float(token)
  rescue ArgumentError, TypeError
    nil
  end

  # Follow HTTP redirect.
  #
  # @param url [String] Redirect URL
  # @raise [RuntimeError] On a non-success HTTP response or too many redirects
  def follow_redirect(url)
    fetch_model(url, MAX_REDIRECTS)
  end

  # GET +url+ and store the body at the model path, following redirects.
  #
  # @param url [String] URL to download
  # @param redirects_left [Integer] How many further redirects may be followed
  # @raise [RuntimeError] On a non-success HTTP response or too many redirects
  def fetch_model(url, redirects_left)
    require 'net/http'
    require 'uri'

    uri = URI.parse(url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    redirect_target = nil

    http.request(Net::HTTP::Get.new(uri.request_uri)) do |response|
      case response
      when Net::HTTPSuccess
        save_response_body(response)
      when Net::HTTPRedirection
        # Location may be relative; resolve it against the current URL.
        redirect_target = URI.join(url, response['location'].to_s).to_s
      else
        raise "Failed to download model: #{response.code} #{response.message}"
      end
    end

    # The redirect is followed after the first connection has been closed.
    return unless redirect_target
    raise 'Failed to download model: too many redirects' unless redirects_left.positive?

    fetch_model(redirect_target, redirects_left - 1)
  end

  # Stream a response body to disk and move it into place once complete.
  #
  # @param response [Net::HTTPResponse] Successful response being read
  def save_response_body(response)
    partial_path = "#{@model_path}.part"
    File.open(partial_path, 'wb') do |file|
      response.read_body { |chunk| file.write(chunk) }
    end
    File.rename(partial_path, @model_path)
  rescue StandardError
    # Do not leave a half-written file behind.
    File.delete(partial_path) if File.exist?(partial_path)
    raise
  end
end
|
|
377
|
+
end
|
|
378
|
+
end
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
# Abstract base class for language implementations.
|
|
6
|
+
#
|
|
7
|
+
# Uses Template Method pattern to define the interface that all
|
|
8
|
+
# language implementations must follow.
|
|
9
|
+
#
|
|
10
|
+
# Each language implementation should:
|
|
11
|
+
# 1. Inherit from this class
|
|
12
|
+
# 2. Implement the required template methods
|
|
13
|
+
# 3. Register itself with Language::Registry
|
|
14
|
+
#
|
|
15
|
+
# @example Implement a language
|
|
16
|
+
# class English < Kotoshu::Language::Base
|
|
17
|
+
# register "en"
|
|
18
|
+
#
|
|
19
|
+
# def initialize
|
|
20
|
+
# super(code: "en", name: "English")
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
# def tokenizer
|
|
24
|
+
# @tokenizer ||= Tokenizer::LatinTokenizer.new
|
|
25
|
+
# end
|
|
26
|
+
#
|
|
27
|
+
# def normalizer
|
|
28
|
+
# @normalizer ||= Normalizer::Base.new
|
|
29
|
+
# end
|
|
30
|
+
#
|
|
31
|
+
# def dictionary_class
|
|
32
|
+
# Dictionary::UnixWords
|
|
33
|
+
# end
|
|
34
|
+
# end
|
|
35
|
+
class Base
  attr_reader :code, :name, :variant, :region

  # Register this language class with the registry under +code+.
  #
  # @param code [String] Language code
  # @return [void]
  def self.register(code)
    Kotoshu::Language::Registry.register(code, self)
  end

  # Lazily built, per-class shared instance.
  #
  # @return [Base] Language instance
  def self.instance
    @instance ||= new
  end

  # Build a language descriptor; the region is derived from the code.
  #
  # @param code [String] Language code (e.g., "en", "en-US", "de-DE")
  # @param name [String] Human-readable name
  # @param variant [String, nil] Variant name (e.g., "American", "British")
  def initialize(code:, name:, variant: nil)
    @code = code
    @name = name
    @variant = variant
    @region = extract_region(code)
  end

  # Tokenizer for this language (template method).
  #
  # @return [Tokenizer::Base] Language-specific tokenizer
  # @raise [NotImplementedError] Unless a subclass overrides it
  def tokenizer
    raise NotImplementedError, "#{self.class} must implement #tokenizer"
  end

  # Normalizer for this language (template method).
  #
  # @return [Normalizer::Base] Language-specific normalizer
  # @raise [NotImplementedError] Unless a subclass overrides it
  def normalizer
    raise NotImplementedError, "#{self.class} must implement #normalizer"
  end

  # Dictionary backend class for this language (template method).
  #
  # @return [Class] Dictionary backend class
  # @raise [NotImplementedError] Unless a subclass overrides it
  def dictionary_class
    raise NotImplementedError, "#{self.class} must implement #dictionary_class"
  end

  # Dictionary search paths; empty here, subclasses may override.
  #
  # @return [Array<String>] List of dictionary paths to search
  def default_dictionary_paths
    []
  end

  # Character encoding name; "UTF-8" unless overridden.
  #
  # @return [String] Character encoding name
  def encoding
    "UTF-8"
  end

  # Whether the script is written right-to-left; false unless overridden.
  #
  # @return [Boolean] True if RTL
  def rtl?
    false
  end

  # Script family of the language; :latin unless overridden.
  #
  # Possible values: :latin, :cyrillic, :arabic, :cjk, :mixed
  #
  # @return [Symbol] Script type
  def script_type
    :latin
  end

  # Split text into tokens by delegating to {#tokenizer}.
  #
  # @param text [String] Text to tokenize
  # @return [Array<String>] Array of tokens
  def tokenize(text)
    tokenizer.tokenize(text)
  end

  # Normalize text by delegating to {#normalizer}.
  #
  # @param text [String] Text to normalize
  # @param options [Hash] Normalization options
  # @return [String] Normalized text
  def normalize(text, options = {})
    normalizer.normalize(text, options)
  end

  # Look a word up in +dictionary+ after normalizing it.
  #
  # @param word [String] Word to check
  # @param dictionary [Dictionary::Base] Dictionary to use
  # @return [Boolean] Result of the dictionary lookup
  def valid_word?(word, dictionary:)
    dictionary.lookup(normalize_word(word))
  end

  # Normalize a single word by delegating to {#normalizer}.
  #
  # @param word [String] Word to normalize
  # @return [String] Normalized word
  def normalize_word(word)
    normalizer.normalize_word(word)
  end

  # Summary of this language's attributes.
  #
  # @return [Hash] Language information
  def info
    {
      code: code,
      name: name,
      variant: variant,
      region: region,
      encoding: encoding,
      rtl?: rtl?,
      script_type: script_type,
      dictionary_class: dictionary_class.name
    }
  end

  # Whether +other_code+ denotes this language.
  #
  # Matches on the exact code or on the part before the first "-"
  # (e.g., "en" matches "en-US").
  #
  # @param other_code [String, nil] Code to compare
  # @return [Boolean] True if matches
  def matches_code?(other_code)
    return false if other_code.nil?
    return true if code == other_code

    own_base = code.split("-").first
    other_base = other_code.split("-").first
    own_base == other_base
  end

  # Name including the variant in parentheses, when one is set.
  #
  # @return [String] Full name
  def full_name
    variant ? "#{name} (#{variant})" : name
  end

  # Whether the code carries no region part.
  #
  # @return [Boolean] True if base language
  def base_language?
    !code.include?("-")
  end

  # Part of the code before the first "-".
  #
  # @return [String] Base language code (e.g., "en" from "en-US")
  def base_code
    code.split("-").first
  end

  # Part of the code after the first "-", exactly as given.
  #
  # @return [String, nil] Region code or nil
  def region_code
    code.include?("-") ? code.split("-", 2).last : nil
  end

  # Whether +other+ is a language sharing this language's base code.
  #
  # @param other [Base] Other language
  # @return [Boolean] True if compatible
  def compatible_with?(other)
    other.is_a?(Base) && base_code == other.base_code
  end

  private

  # Upper-cased part of +code+ after the first "-".
  #
  # @param code [String] Language code
  # @return [String, nil] Region or nil
  def extract_region(code)
    code.split("-", 2).last.upcase if code.include?("-")
  end
end
|
|
255
|
+
end
|
|
256
|
+
end
|