kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'layout'
|
|
4
|
+
require_relative 'layouts/qwerty'
|
|
5
|
+
require_relative 'layouts/qwertz'
|
|
6
|
+
require_relative 'layouts/azerty'
|
|
7
|
+
require_relative 'layouts/jcuken'
|
|
8
|
+
require_relative 'layouts/dvorak'
|
|
9
|
+
|
|
10
|
+
module Kotoshu
  module Keyboard
    # Registry for keyboard layouts
    #
    # The registry provides a centralized way to access keyboard layouts
    # and automatically selects the appropriate layout for a given language.
    # Layout instances are stored in a hash keyed by the layout class name.
    #
    # @example Getting layout for a language
    #   layout = Keyboard::Registry.layout_for('de')
    #   layout.name # => "QWERTZ"
    #
    # @example Getting layout by name
    #   layout = Keyboard::Registry.layout_by_name('Dvorak')
    #   layout.name # => "Dvorak"
    #
    # @example Listing all available layouts
    #   Keyboard::Registry.available_layouts.each do |layout|
    #     puts "#{layout.name}: #{layout.language_codes.join(', ')}"
    #   end
    #
    class Registry
      # Registry key (the layout class name) of the QWERTY layout, which is
      # used as the last-resort fallback by the lookup methods.
      QWERTY_KEY = 'Kotoshu::Keyboard::Layouts::QWERTY'.freeze
      private_constant :QWERTY_KEY

      class << self
        # Register a keyboard layout
        #
        # Instantiates the class and stores the instance under the class
        # name, replacing any layout previously registered under that name.
        #
        # @param layout_class [Class<Layout>] the layout class to register
        # @return [Layout] the instantiated layout
        def register(layout_class)
          layouts[layout_class.name] = layout_class.new
        end

        # Get layout for a specific language code
        #
        # Searches for a layout that supports the given language code. If
        # none does, the region suffix is stripped ('en-GB' or 'en_GB' ->
        # 'en') and the search is repeated with the base language. Falls
        # back to the default layout (QWERTY unless another default was
        # registered) when nothing matches.
        #
        # @param language_code [String] the language code (e.g., 'en', 'de', 'fr', 'ru')
        # @return [Layout] the keyboard layout for the language
        def layout_for(language_code)
          # Try exact match first
          layout = find_by_language(language_code)
          return layout if layout

          # Try base language if variant; both '-' and '_' separators are
          # accepted because locale identifiers appear in either form.
          base_lang = language_code.to_s.split(/[-_]/).first
          # Skip the second scan when it would repeat the first one verbatim.
          layout = find_by_language(base_lang) unless base_lang == language_code

          layout || default_layout
        end

        # Get layout by name
        #
        # A layout matches when its +name+ equals the given name or when
        # its class name ends with "::<name>".
        #
        # @param name [String, Symbol] the layout name (e.g., 'QWERTY', 'Dvorak')
        # @return [Layout] the layout, or QWERTY as fallback if not found
        def layout_by_name(name)
          # Return QWERTY as fallback (not default_layout, which itself
          # resolves the configured default through the same name lookup)
          find_by_name(name) || layouts[QWERTY_KEY]
        end

        # Get all available layouts
        #
        # @return [Array<Layout>] list of all registered layouts
        def available_layouts
          layouts.values
        end

        # Get all supported language codes
        #
        # @return [Array<String>] sorted, de-duplicated list of all language
        #   codes across all layouts
        def supported_languages
          layouts.values.flat_map(&:language_codes).uniq.sort
        end

        # Set the default layout
        #
        # Only the name is stored; it is resolved against the registered
        # layouts each time the default is needed.
        #
        # @param layout_name [String, Symbol] the name of the layout to use as default
        def register_default(layout_name)
          @default_layout_name = layout_name
        end

        # Check if a language is supported
        #
        # @param language_code [String] the language code to check
        # @return [Boolean] true if the language is supported by any layout
        def supports_language?(language_code)
          layouts.values.any? { |l| l.supports_language?(language_code) }
        end

        # Clear all registered layouts (mainly for testing)
        #
        # Also forgets the configured default layout name.
        #
        # @return [void]
        def clear!
          @layouts = nil
          @default_layout_name = nil
        end

        private

        # Get or initialize the layouts hash
        #
        # @return [Hash] hash of layout class names to instances
        def layouts
          @layouts ||= {}
        end

        # Find the first registered layout that supports a language code.
        #
        # @param language_code [String, nil] the code passed to each layout
        # @return [Layout, nil] the first supporting layout, if any
        def find_by_language(language_code)
          layouts.values.find { |l| l.supports_language?(language_code) }
        end

        # Find a registered layout by display name or class-name suffix.
        #
        # @param name [String, Symbol] the layout name
        # @return [Layout, nil] the first matching layout, if any
        def find_by_name(name)
          name_str = name.to_s
          suffix = "::#{name_str}"
          layouts.values.find do |layout|
            layout.name == name_str || layout.class.name.to_s.end_with?(suffix)
          end
        end

        # Get the default layout
        #
        # @return [Layout] the default layout (QWERTY if none specified or
        #   the configured name is not registered; otherwise the first
        #   registered layout as a last resort)
        def default_layout
          configured = find_by_name(@default_layout_name) if @default_layout_name
          return configured if configured

          # Return QWERTY as the ultimate fallback
          layouts[QWERTY_KEY] || layouts.values.first
        end
      end

      # Auto-register all layout classes on load
      register(Layouts::QWERTY)
      register(Layouts::QWERTZ)
      register(Layouts::AZERTY)
      register(Layouts::JCUKEN)
      register(Layouts::Dvorak)
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'keyboard/registry'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
  # Keyboard layout system for Kotoshu
  #
  # This module provides access to keyboard layouts for typo detection
  # and suggestion ranking in multi-language spell checking. Every method
  # here is a thin delegator to {Keyboard::Registry}.
  #
  # @example Getting a keyboard layout for a language
  #   layout = Kotoshu::Keyboard.layout_for('de')
  #   layout.name # => "QWERTZ"
  #
  # @example Getting a layout by name
  #   dvorak = Kotoshu::Keyboard.layout_by_name('Dvorak')
  #   dvorak.name # => "Dvorak"
  #
  module Keyboard
    class << self
      # Get keyboard layout for a language code
      #
      # @param language_code [String] the language code (e.g., 'en', 'de', 'fr', 'ru')
      # @return [Layout] the keyboard layout for the language
      def layout_for(language_code)
        Registry.layout_for(language_code)
      end

      # Get keyboard layout by name
      #
      # @param name [String, Symbol] the layout name (e.g., 'QWERTY', 'Dvorak')
      # @return [Layout, nil] the matching layout, or the registry's QWERTY
      #   fallback when no layout matches (nil only if that fallback is not
      #   registered)
      def layout_by_name(name)
        Registry.layout_by_name(name)
      end

      # Get all available layouts
      #
      # @return [Array<Layout>] list of all registered layouts
      def available_layouts
        Registry.available_layouts
      end

      # Get all supported language codes
      #
      # @return [Array<String>] list of all language codes across all layouts
      def supported_languages
        Registry.supported_languages
      end

      # Check if a language is supported
      #
      # @param language_code [String] the language code to check
      # @return [Boolean] true if the language is supported by any layout
      def supports_language?(language_code)
        Registry.supports_language?(language_code)
      end
    end
  end
end
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Language
    # Heuristic language detection based on character sets and patterns.
    #
    # Every supported language has a regular expression for its
    # characteristic characters. The share of the text matched by that
    # expression (relative to the full text length, whitespace and
    # punctuation included) is turned into a score, and the best-scoring
    # language wins. Equal scores are broken by the optional +:priority+ of
    # the language configuration, then by declaration order.
    #
    # @example Detect language
    #   Language::Detector.detect("Hello world") # => "en"
    #   Language::Detector.detect("café")        # => "fr"
    class Detector
      # Character set ranges for language detection
      CHARACTER_SETS = {
        cyrillic: /\p{Cyrillic}/,
        hiragana: /[\u3040-\u309F]/,
        katakana: /[\u30A0-\u30FF]/,
        cjk: /[\u4E00-\u9FFF]/,
        hangul: /[\uAC00-\uD7AF]/,
        latin: /[a-zA-Zà-ÿ]/
      }.freeze

      # Accented Latin-1 letters in both cases. The multiplication and
      # division signs that sit inside the same code point range are
      # deliberately excluded.
      ACCENTED_LATIN = /[À-ÖØ-öø-ÿ]/
      private_constant :ACCENTED_LATIN

      # Language-specific patterns
      #
      # Keys per language:
      # * +:pattern+          - regexp matching ONE characteristic character
      # * +:min_ratio+        - minimum share of matching characters
      # * +:scripts+          - scripts that earn a 10% bonus each when present
      # * +:must_have+        - scripts that must all be present (optional)
      # * +:must_not_have+    - scripts that must be absent (optional)
      # * +:max_accent_ratio+ - maximum share of accented letters (optional)
      # * +:priority+         - tie-breaker, higher wins (optional, default 0)
      LANGUAGE_PATTERNS = {
        # Russian: Cyrillic. The pattern matches single characters so that
        # the ratio counts letters, not letter pairs.
        russian: {
          pattern: /\p{Cyrillic}/,
          min_ratio: 0.3,
          scripts: [:cyrillic]
        },

        # Japanese: Mixed script (Hiragana + Katakana + Kanji)
        japanese: {
          pattern: /[\u3040-\u309F]|[\u30A0-\u30FF]|[\u4E00-\u9FFF]/,
          min_ratio: 0.2,
          scripts: [:hiragana, :katakana, :cjk],
          must_have: [:hiragana] # Only require hiragana, not both
        },

        # Portuguese: Latin with specific accents
        portuguese: {
          pattern: /[ãõáàâãéêíóôõúç]/i,
          min_ratio: 0.05,
          scripts: [:latin]
        },

        # French: Latin with specific accents (NOT German umlauts)
        french: {
          pattern: /[éèêëàâùûüîïôç]/i, # Removed ä, ö (not French)
          min_ratio: 0.02, # Lower threshold
          scripts: [:latin],
          priority: 1 # Higher priority than English
        },

        # Spanish: Latin with inverted punctuation
        spanish: {
          pattern: /[áéíóúüñ¿¡]/i,
          min_ratio: 0.02, # Lower threshold
          scripts: [:latin],
          priority: 1
        },

        # German: Latin with umlauts and eszett
        german: {
          pattern: /[äöüßÄÖÜ]/, # Explicitly include uppercase
          min_ratio: 0.02, # Lower threshold
          scripts: [:latin],
          priority: 1
        },

        # English: Latin with minimal accents
        english: {
          pattern: /[a-zA-Z]/,
          min_ratio: 0.3,
          scripts: [:latin],
          max_accent_ratio: 0.02
        }
      }.freeze

      # Language code mapping
      CODE_MAPPING = {
        russian: "ru",
        japanese: "ja",
        portuguese: "pt",
        french: "fr",
        spanish: "es",
        german: "de",
        english: "en"
      }.freeze

      # Tie-break priority per language code, derived once from
      # LANGUAGE_PATTERNS instead of being searched for on every comparison.
      PRIORITY_BY_CODE = LANGUAGE_PATTERNS.each_with_object({}) do |(language, config), table|
        table[CODE_MAPPING[language]] = config.fetch(:priority, 0)
      end.freeze
      private_constant :PRIORITY_BY_CODE

      class << self
        # Detect language from text.
        #
        # Returns the most probable language code based on character analysis.
        #
        # @param text [String] Text to analyze
        # @return [String, nil] Detected language code or nil if uncertain
        #   (nil/blank text, or no language reaches its minimum ratio)
        def detect(text)
          return nil if blank_text?(text)

          best_match(analyze_languages(text))&.first
        end

        # Detect with confidence score.
        #
        # Uses the same ranking as {detect}, so both methods always agree on
        # the winning language.
        #
        # @param text [String] Text to analyze
        # @return [Array<String, Float>] Language code and confidence (0-1);
        #   [nil, 0.0] when nothing is detected
        def detect_with_confidence(text)
          return [nil, 0.0] if blank_text?(text)

          scores = analyze_languages(text)
          return [nil, 0.0] if scores.empty?

          top_language, top_score = best_match(scores)
          [top_language, normalize_confidence(top_score, scores.values)]
        end

        # Get multiple language candidates.
        #
        # Candidates are ordered by score (highest first); equal scores are
        # ordered by priority and then by declaration order, so the first
        # candidate is the language {detect} returns. Each confidence is the
        # candidate's share of the sum of all positive scores.
        #
        # @param text [String] Text to analyze
        # @param limit [Integer] Maximum candidates to return
        # @return [Array<Array<String, Float>>] Array of [code, confidence] pairs
        def detect_candidates(text, limit: 3)
          return [] if blank_text?(text)

          scores = analyze_languages(text)
          return [] if scores.empty?

          total_score = scores.values.sum.to_f
          ranked = scores.each_with_index.sort_by do |(code, score), position|
            # position keeps the ordering deterministic; sort_by is not stable
            [-score, -PRIORITY_BY_CODE.fetch(code, 0), position]
          end

          ranked.first(limit).map { |(code, score), _position| [code, score / total_score] }
        end

        private

        # Whether there is nothing to analyze.
        #
        # @param text [String, nil] Candidate text
        # @return [Boolean] true for nil, empty or whitespace-only text
        def blank_text?(text)
          text.nil? || text.strip.empty?
        end

        # Pick the winning [code, score] pair.
        #
        # Ranks by score first and by configured priority second; among
        # fully equal entries the first one in declaration order wins.
        #
        # @param scores [Hash{String => Float}] Positive scores per code
        # @return [Array<String, Float>, nil] Best pair, nil for an empty hash
        def best_match(scores)
          scores.max_by { |code, score| [score, PRIORITY_BY_CODE.fetch(code, 0)] }
        end

        # Analyze text and score each language.
        #
        # @param text [String] Text to analyze
        # @return [Hash] Hash mapping language codes to scores; languages
        #   scoring zero are omitted
        def analyze_languages(text)
          text_length = text.length.to_f
          return {} if text_length.zero?

          LANGUAGE_PATTERNS.each_with_object({}) do |(language, config), scores|
            score = score_language(text, language, config, text_length)
            scores[CODE_MAPPING[language]] = score if score > 0
          end
        end

        # Score a specific language against text.
        #
        # @param text [String] Text to analyze
        # @param language [Symbol] Language key
        # @param config [Hash] Language configuration
        # @param text_length [Float] Length of text
        # @return [Float] Score (0-1); 0.0 when the language is ruled out
        def score_language(text, language, config, text_length)
          return 0.0 unless scripts_acceptable?(text, config)

          # Count matching characters
          matches = text.scan(config[:pattern]).length
          ratio = matches / text_length

          # Check minimum ratio
          return 0.0 if ratio < config[:min_ratio]

          # Check maximum accent ratio (for English)
          if config[:max_accent_ratio]
            accent_ratio = text.scan(ACCENTED_LATIN).length / text_length
            return 0.0 if accent_ratio > config[:max_accent_ratio]
          end

          # Bonus of 10% for each listed script that occurs in the text
          score = ratio
          if config[:scripts]
            script_bonus = config[:scripts].count { |script| script_present?(text, script) }
            score *= (1 + script_bonus * 0.1)
          end

          # Extra bonus for non-English languages, proportional to the share
          # of their characteristic characters (accents, umlauts, etc.).
          # This helps distinguish them from plain English.
          score *= (1 + ratio) if language != :english && matches > 0

          [score, 1.0].min
        end

        # Check the +:must_have+ / +:must_not_have+ script constraints.
        #
        # @param text [String] Text to analyze
        # @param config [Hash] Language configuration
        # @return [Boolean] false when a required script is missing or a
        #   forbidden script is present
        def scripts_acceptable?(text, config)
          required = config[:must_have] || []
          forbidden = config[:must_not_have] || []

          required.all? { |script| script_present?(text, script) } &&
            forbidden.none? { |script| script_present?(text, script) }
        end

        # Whether the text contains at least one character of a script.
        #
        # @param text [String] Text to analyze
        # @param script [Symbol] Key of CHARACTER_SETS
        # @return [Boolean]
        def script_present?(text, script)
          text.match?(CHARACTER_SETS[script])
        end

        # Normalize confidence score.
        #
        # A single candidate yields full confidence. Otherwise the top
        # score's share against the runner-up is mapped onto 0.2..1.0.
        #
        # @param top_score [Float] Highest score
        # @param all_scores [Array<Float>] All scores
        # @return [Float] Normalized confidence (0-1)
        def normalize_confidence(top_score, all_scores)
          return 0.0 if top_score.zero?

          second_best = all_scores.max(2)[1] || 0
          return 1.0 if second_best.zero?

          ratio = top_score / (top_score + second_best)
          (ratio * 0.8 + 0.2).clamp(0.0, 1.0) # Minimum confidence 0.2
        end
      end
    end
  end
end
|