kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  # Optional instrumentation facade for Kotoshu.
  #
  # Every recording/reporting call is forwarded to a Collector instance
  # (defined elsewhere) that only exists while metrics are enabled. While
  # disabled, recording calls are no-ops and reporting calls return an
  # empty value.
  #
  # @example Enable metrics
  #   Kotoshu::Metrics.enable
  #   Kotoshu.correct?("hello")
  #   Kotoshu::Metrics.stats
  #   # => { lookups: 1, cache_hits: 0, cache_misses: 1, ... }
  module Metrics
    class << self
      # The collector currently receiving events.
      #
      # @return [Collector, nil] The collector instance (nil while disabled)
      attr_reader :collector

      # Turn metrics collection on, starting with a brand new collector.
      #
      # @return [Collector] The freshly created collector
      def enable
        @enabled = true
        @collector = Collector.new
      end

      # Turn metrics collection off and drop the current collector.
      #
      # @return [nil]
      def disable
        @enabled = false
        @collector = nil
      end

      # Whether metrics are currently being collected.
      #
      # @return [Boolean] True if enabled
      def enabled?
        # First call initialises the flag so it is never left undefined.
        @enabled = false unless @enabled
        @enabled
      end

      # Record a lookup operation.
      #
      # @param word [String] The word being looked up
      # @param result [Boolean] The lookup result
      # @param time [Float] Time taken in milliseconds
      def record_lookup(word, result:, time:)
        active_collector&.record_lookup(word, result: result, time: time)
      end

      # Record a cache operation.
      #
      # @param cache_type [String] Type of cache (lookup, suggestion)
      # @param hit [Boolean] True if cache hit
      def record_cache(cache_type, hit:)
        active_collector&.record_cache(cache_type, hit: hit)
      end

      # Record suggestion generation.
      #
      # @param word [String] The input word
      # @param count [Integer] Number of suggestions generated
      # @param time [Float] Time taken in milliseconds
      def record_suggestions(word, count:, time:)
        active_collector&.record_suggestions(word, count: count, time: time)
      end

      # Snapshot of the collected statistics.
      #
      # @return [Hash] Current statistics ({} while disabled)
      def stats
        active_collector&.stats || {}
      end

      # Ask the collector to clear everything it has gathered so far.
      def reset
        active_collector&.reset
      end

      # Render the metrics in StatsD format.
      #
      # @return [String] StatsD protocol lines ("" while disabled)
      def to_statsd
        active_collector&.to_statsd || ""
      end

      # Render the metrics in Prometheus format.
      #
      # @return [String] Prometheus exposition format ("" while disabled)
      def to_prometheus
        active_collector&.to_prometheus || ""
      end

      private

      # The collector, but only while metrics are enabled.
      #
      # @return [Collector, nil]
      def active_collector
        collector if enabled?
      end
    end
  end
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Models
    # Value object for text context around an error.
    #
    # Holds the text before, at, and after an error location. The three
    # parts are joined with newlines into {#full_context}. Instances are
    # frozen at the end of construction.
    #
    # @example Creating context
    #   context = Context.new(
    #     before: "The quick brown",
    #     current: "fox",
    #     after: "jumps over",
    #     location: Location.new(line: 5, column: 16)
    #   )
    #   context.full_context # => "The quick brown\nfox\njumps over"
    class Context
      attr_reader :before, :current, :after, :location, :window, :full_context

      # Create a new context object.
      #
      # @param before [String, nil] Text before the error
      # @param current [String, nil] The current line/text containing the error
      # @param after [String, nil] Text after the error
      # @param location [Documents::Location, nil] The error location; only
      #   its #line and #column readers are used here
      # @param window [Integer] Window size used for context (default: 5)
      def initialize(before:, current:, after:, location:, window: 5)
        @before = before
        @current = current
        @after = after
        @location = location
        @window = window
        # nil parts are dropped so they do not produce empty lines.
        @full_context = [before, current, after].compact.join("\n")
        freeze
      end

      # Get the words surrounding the error location within the current text.
      #
      # When the location's column falls inside a word, that exact
      # occurrence is used as the centre of the window. Otherwise the first
      # word equal to {#word_at_location} is used. If no centre can be
      # determined, every word of the current text is returned.
      #
      # @param n [Integer] Number of words on each side (default: 3)
      # @return [Array<String>] Surrounding words (centre word included)
      def surrounding_words(n = 3)
        return [] if @current.nil? || @current.empty?

        tokens = token_spans
        return [] if tokens.empty?

        words = tokens.map(&:last)

        idx = token_index_at_column(tokens)
        unless idx
          target_word = word_at_location
          return words unless target_word

          idx = words.index(target_word)
          return words unless idx
        end

        start_idx = [0, idx - n].max
        end_idx = [words.size - 1, idx + n].min

        words[start_idx..end_idx].to_a
      end

      # Get the word at the error location.
      #
      # The column is used as a zero-based character index into the current
      # text. When it falls inside a whitespace-delimited word, that whole
      # word is returned. When there is no column, or it points outside any
      # word, the whole current text is returned.
      #
      # @return [String, nil] The word at the error location (nil without a location)
      def word_at_location
        return nil unless @location

        tokens = @current ? token_spans : []
        idx = token_index_at_column(tokens)
        return tokens[idx].last if idx

        # For node-based locations, return the current text
        @current
      end

      # Check if this context equals another.
      #
      # @param other [Object] Another object
      # @return [Boolean] True if location and full context both match
      def ==(other)
        return false unless other.is_a?(Context)

        @location == other.location && @full_context == other.full_context
      end
      alias_method :eql?, :==

      # Hash code for hash table usage (built from the same fields as #==).
      #
      # @return [Integer] Hash code
      def hash
        [@location, @full_context].hash
      end

      # String representation.
      #
      # @return [String] Full context, prefixed with the line number when known
      def to_s
        # Safe navigation: a nil location is accepted elsewhere in this class.
        line = @location&.line
        line ? "Line #{line}: #{@full_context}" : @full_context
      end
      alias_method :inspect, :to_s

      # Get context as a formatted string with error highlighting.
      #
      # @param error_word [String, nil] The error word to highlight
      # @return [String] Full context with every occurrence underlined via ANSI codes
      def with_highlight(error_word)
        # An empty pattern would match between every character.
        return @full_context if error_word.nil? || error_word == ""

        @full_context.gsub(error_word) { |m| "\033[4m#{m}\033[0m" }
      end

      private

      # Whitespace-delimited tokens of the current text with their offsets.
      #
      # @return [Array<Array(Integer, String)>] [start_index, word] pairs
      def token_spans
        spans = []
        @current.to_s.scan(/\S+/) do
          match = Regexp.last_match
          spans << [match.begin(0), match[0]]
        end
        spans
      end

      # Index of the token that covers the location's column, if any.
      #
      # @param tokens [Array<Array(Integer, String)>] Output of #token_spans
      # @return [Integer, nil] Token index or nil when no token covers the column
      def token_index_at_column(tokens)
        column = @location&.column
        return nil unless column.is_a?(Integer) && column >= 0

        tokens.index { |start, word| column >= start && column < start + word.length }
      end
    end
  end
end
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
  module Models
    # Abstract base class for word embedding models.
    #
    # Provides a unified interface for loading and querying word embeddings
    # from different sources (FastText, Word2Vec, GloVe, ONNX, etc.).
    #
    # @example Using an embedding model
    #   model = FastTextModel.from_file('cc.en.300.vec')
    #   embedding = model.embedding_for('hello')
    #   similarity = model.similarity('hello', 'world')
    #   neighbors = model.nearest_neighbors('hello', k: 10)
    #
    # @abstract Subclasses must implement {#embedding_for} and {#vocabulary}
    class EmbeddingModel
      attr_reader :language_code, :dimension, :vocabulary_size

      # Create a new embedding model.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param dimension [Integer] Vector dimensionality (e.g., 300)
      # @raise [ArgumentError] if language_code is nil or dimension is not positive
      def initialize(language_code:, dimension:)
        raise ArgumentError, "Language code cannot be nil" if language_code.nil?
        raise ArgumentError, "Dimension must be positive" unless dimension&.positive?

        @language_code = language_code
        @dimension = dimension
        @vocabulary_size = 0
        # The instance is deliberately NOT frozen here: subclasses assign
        # their own state after calling super, which would raise FrozenError
        # on a frozen object. A subclass may freeze itself once fully built.
      end

      # Get embedding vector for a word.
      #
      # @param word [String] The word to lookup
      # @return [WordEmbedding, nil] Embedding vector or nil if not found
      # @abstract Subclass must implement
      def embedding_for(word)
        raise NotImplementedError, "#{self.class} must implement #embedding_for"
      end

      # Check if a word is in the vocabulary.
      #
      # @param word [String] The word to check
      # @return [Boolean] True if word exists in vocabulary
      def has_word?(word)
        vocabulary.include?(word)
      end

      # Calculate the similarity between two words.
      #
      # Delegates to the #similarity method of the first word's embedding.
      #
      # @param word1 [String] First word
      # @param word2 [String] Second word
      # @return [Float, nil] Similarity score, or nil if either word has no embedding
      def similarity(word1, word2)
        emb1 = embedding_for(word1)
        emb2 = embedding_for(word2)

        return nil unless emb1 && emb2

        emb1.similarity(emb2)
      end

      # Calculate the distance between two words.
      #
      # Delegates to the #distance method of the first word's embedding.
      #
      # @param word1 [String] First word
      # @param word2 [String] Second word
      # @return [Float, nil] Distance, or nil if either word has no embedding
      def distance(word1, word2)
        emb1 = embedding_for(word1)
        emb2 = embedding_for(word2)

        return nil unless emb1 && emb2

        emb1.distance(emb2)
      end

      # Find k nearest neighbors for a word. The query word itself is excluded.
      #
      # @param word [String] The query word
      # @param k [Integer] Number of neighbors to return
      # @return [Array<NearestNeighbor>] Nearest neighbors, best match first
      def nearest_neighbors(word, k: 10)
        embedding = embedding_for(word)
        return [] unless embedding

        ranked_neighbors(embedding, k, skip: word)
      end

      # Find k nearest neighbors for an embedding vector.
      #
      # No vocabulary word is excluded, so a word whose vector equals the
      # query can appear in the result.
      #
      # @param embedding [WordEmbedding] The query embedding
      # @param k [Integer] Number of neighbors to return
      # @return [Array<NearestNeighbor>] Nearest neighbors, best match first
      def nearest_neighbors_for_embedding(embedding, k: 10)
        return [] unless embedding

        ranked_neighbors(embedding, k)
      end

      # Get model metadata.
      #
      # @return [Hash] Model metadata
      def metadata
        {
          language_code: language_code,
          dimension: dimension,
          vocabulary_size: vocabulary_size,
          model_type: self.class.name
        }
      end

      # Get the vocabulary (all words in the model).
      #
      # @return [Array<String>] Vocabulary words
      # @abstract Subclass must implement
      def vocabulary
        raise NotImplementedError, "#{self.class} must implement #vocabulary"
      end

      # Check if model is loaded.
      #
      # @return [Boolean] Truthy if the vocabulary size is positive or the
      #   vocabulary has any entries
      def loaded?
        vocabulary_size&.positive? || vocabulary&.any?
      end

      # Get model statistics.
      #
      # @return [Hash] Statistics about the model
      def statistics
        {
          language: language_code,
          dimension: dimension,
          vocabulary_size: vocabulary_size,
          loaded: loaded?
        }
      end

      # String representation.
      #
      # @return [String] Human-readable representation
      def to_s
        "#{self.class.name}(language: #{language_code}, dim: #{dimension}, vocab: #{vocabulary_size})"
      end
      alias_method :inspect, :to_s

      private

      # Score every vocabulary word against +query+ and keep the best +k+.
      #
      # Ordering relies on NearestNeighbor being comparable (as the
      # sort-based ranking it replaces did); Array#max(k) returns the k
      # largest elements in descending order without sorting everything.
      #
      # @param query [WordEmbedding] Embedding to compare against
      # @param k [Integer] Number of neighbors to keep
      # @param skip [String, nil] Vocabulary word to leave out, if any
      # @return [Array<NearestNeighbor>] Best matches first
      def ranked_neighbors(query, k, skip: nil)
        candidates = vocabulary.each_with_object([]) do |vocab_word, found|
          next if !skip.nil? && vocab_word == skip

          vocab_embedding = embedding_for(vocab_word)
          next unless vocab_embedding

          found << NearestNeighbor.new(
            word: vocab_word,
            similarity: query.similarity(vocab_embedding),
            distance: query.distance(vocab_embedding),
            embedding: vocab_embedding
          )
        end

        candidates.max(k)
      end
    end
  end
end
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'embedding_model'
|
|
4
|
+
require_relative 'word_embedding'
|
|
5
|
+
require_relative 'nearest_neighbor'
|
|
6
|
+
|
|
7
|
+
module Kotoshu
  module Models
    # FastText embedding model implementation.
    #
    # Loads FastText pre-trained word vectors from .vec text files into an
    # in-memory Hash keyed by word.
    #
    # @example Loading from file
    #   model = FastTextModel.from_file('cc.en.300.vec')
    #   model.embedding_for('hello')
    #
    # @example Loading from GitHub
    #   model = FastTextModel.from_github('en')
    #   model.nearest_neighbors('hello', k: 10)
    #
    # @see https://fasttext.cc/docs/en/crawl-vectors.html FastText crawl vectors
    # @see https://fasttext.cc/docs/en/english-vectors.html FastText English vectors
    class FastTextModel < EmbeddingModel
      # Standard FastText dimension for crawl vectors
      DEFAULT_DIMENSION = 300

      # Number of vectors to load when reading from file
      # FastText .vec files contain up to 2M words; we load a subset by default
      DEFAULT_MAX_VECTORS = 1_000_000

      attr_reader :embeddings, :max_vectors

      # Create a new FastText model.
      #
      # @param language_code [String] ISO 639-1 language code
      # @param dimension [Integer] Vector dimension (default: 300)
      # @param embeddings [Hash<String, WordEmbedding>] Pre-loaded embeddings
      #   (the Hash is frozen by this constructor)
      # @param max_vectors [Integer] Maximum vectors to load from file
      # @raise [ArgumentError] if language_code is nil or dimension is not positive
      def initialize(language_code:, dimension: DEFAULT_DIMENSION, embeddings: {}, max_vectors: DEFAULT_MAX_VECTORS)
        # Own state is assigned BEFORE calling super so that construction
        # also works when the parent initializer freezes the instance
        # (assigning an instance variable on a frozen object raises FrozenError).
        @embeddings = embeddings.freeze
        @max_vectors = max_vectors
        super(language_code: language_code, dimension: dimension)
        @vocabulary_size = @embeddings.size unless frozen?
      end

      # Number of words held by this model.
      #
      # Derived from the embeddings table so it is correct whether or not
      # the instance variable could be updated after super.
      #
      # @return [Integer] Vocabulary size
      def vocabulary_size
        @embeddings.size
      end

      # Load FastText model from a .vec file.
      #
      # The first line must be a "<vocab_size> <dimension>" header. Every
      # following line is "<word> <v1> ... <vN>"; blank lines and lines
      # whose vector length differs from the header dimension are skipped.
      #
      # @param file_path [String] Path to FastText .vec file
      # @param max_vectors [Integer] Maximum vectors to load (default: 1M)
      # @param language_code [String] Language code (auto-detected from filename)
      # @return [FastTextModel] Loaded model
      # @raise [ArgumentError] if file doesn't exist or has no valid header
      def self.from_file(file_path, max_vectors: DEFAULT_MAX_VECTORS, language_code: nil)
        raise ArgumentError, "File not found: #{file_path}" unless File.exist?(file_path)

        # Detect language from filename if not provided
        language_code ||= detect_language_from_path(file_path)

        embeddings = {}
        dimension = nil
        count = 0

        File.open(file_path, 'r', encoding: 'UTF-8') do |file|
          # IO#gets returns nil on an empty file; to_s keeps the split safe.
          header = file.gets.to_s.split
          dimension = header[1].to_i
          raise ArgumentError, "Invalid FastText header in #{file_path}" unless dimension.positive?

          file.each_line do |line|
            break if count >= max_vectors

            word, *values = line.split
            # Skip blank lines and rows with the wrong number of components.
            next if word.nil? || values.size != dimension

            embeddings[word] = WordEmbedding.new(word, values.map(&:to_f), language_code, dimension: dimension)
            count += 1
          end
        end

        new(language_code: language_code, dimension: dimension, embeddings: embeddings, max_vectors: max_vectors)
      end

      # Load FastText model from GitHub (via ModelCache).
      #
      # The .vec file path is obtained from the cache's #get_fasttext_model
      # and then parsed with {.from_file}.
      #
      # @param language_code [String] ISO 639-1 language code (de, en, es, fr, pt, ru)
      # @param max_vectors [Integer] Maximum vectors to load (default: 500K for GitHub)
      # @param cache [ModelCache, nil] Optional cache instance
      # @return [FastTextModel] Loaded model
      # @raise [ArgumentError] if the resolved file does not exist or is invalid
      def self.from_github(language_code, max_vectors: 500_000, cache: nil)
        # Loaded lazily so the cache code is only required when this path is used.
        require_relative '../cache/model_cache'

        cache ||= Cache::ModelCache.new

        # Get the .vec file path from cache
        vec_file = cache.get_fasttext_model(language_code)

        from_file(vec_file, max_vectors: max_vectors, language_code: language_code)
      end

      # Detect language code from file path.
      #
      # Looks for a two-letter segment between dots in the file NAME only
      # (e.g. "cc.en.300.vec" or "wiki.de.vec"), so dotted directory names
      # cannot be mistaken for the language.
      #
      # This method is public: a bare `private` keyword does not apply to
      # `def self.` methods, so it has always been callable from outside.
      #
      # @param path [String] File path
      # @return [String] Detected language code ('en' when none is found)
      def self.detect_language_from_path(path)
        match = File.basename(path.to_s).match(/\.([a-z]{2})\./i)
        match ? match[1].downcase : 'en' # Default to English
      end

      # Get embedding vector for a word.
      #
      # @param word [String] The word to lookup
      # @return [WordEmbedding, nil] Embedding vector or nil if not found
      def embedding_for(word)
        return nil if word.nil? || word.empty?

        # Direct lookup
        @embeddings[word]
      end

      # Get the vocabulary (all words in the model).
      #
      # @return [Array<String>] Vocabulary words
      def vocabulary
        @embeddings.keys
      end

      # Check if model is loaded.
      #
      # @return [Boolean] True if embeddings are loaded
      def loaded?
        @embeddings&.any?
      end

      # Find k nearest neighbors for a word (optimized version).
      #
      # Iterates the embeddings table directly instead of looking each
      # vocabulary word up again. The query word itself is excluded.
      #
      # @param word [String] The query word
      # @param k [Integer] Number of neighbors to return
      # @return [Array<NearestNeighbor>] Nearest neighbors, best match first
      def nearest_neighbors(word, k: 10)
        embedding = embedding_for(word)
        return [] unless embedding

        top_neighbors_from_table(embedding, k, skip: word)
      end

      # Find k nearest neighbors for an embedding vector (optimized version).
      #
      # @param embedding [WordEmbedding] The query embedding
      # @param k [Integer] Number of neighbors to return
      # @return [Array<NearestNeighbor>] Nearest neighbors, best match first
      def nearest_neighbors_for_embedding(embedding, k: 10)
        return [] unless embedding

        top_neighbors_from_table(embedding, k)
      end

      # Get batch embeddings for multiple words.
      #
      # @param words [Array<String>] Words to lookup
      # @return [Hash<String, WordEmbedding>] Mapping of word to embedding
      #   (words without an embedding are omitted)
      def batch_embeddings(words)
        words.each_with_object({}) do |word, hash|
          emb = embedding_for(word)
          hash[word] = emb if emb
        end
      end

      # Get batch similarities for word pairs.
      #
      # @param pairs [Array<Array<String, String>>] Word pairs
      # @return [Array<Float, nil>] Similarity scores (nil where a word is unknown)
      def batch_similarities(pairs)
        pairs.map { |word1, word2| similarity(word1, word2) }
      end

      private

      # Score every stored embedding against +query+ and keep the best +k+.
      #
      # Array#max(k) returns the k largest elements in descending order,
      # relying on NearestNeighbor being comparable just like the sort-based
      # ranking it replaces.
      #
      # NOTE(review): the generic EmbeddingModel implementation also passes
      # `distance:` to NearestNeighbor.new; it is omitted here as before.
      # Confirm NearestNeighbor treats distance as optional.
      #
      # @param query [WordEmbedding] Embedding to compare against
      # @param k [Integer] Number of neighbors to keep
      # @param skip [String, nil] Word to leave out of the result, if any
      # @return [Array<NearestNeighbor>] Best matches first
      def top_neighbors_from_table(query, k, skip: nil)
        candidates = @embeddings.each_with_object([]) do |(vocab_word, vocab_embedding), found|
          next if !skip.nil? && vocab_word == skip

          found << NearestNeighbor.new(
            word: vocab_word,
            similarity: query.similarity(vocab_embedding),
            embedding: vocab_embedding
          )
        end

        candidates.max(k)
      end
    end
  end
end
|