kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ # Metrics and instrumentation for Kotoshu.
5
+ #
6
+ # Provides thread-safe collection of performance metrics:
7
+ # - Lookup counts and timing
8
+ # - Cache hit/miss rates
9
+ # - Suggestion generation stats
10
+ # - Optional export to StatsD or Prometheus
11
+ #
12
+ # @example Enable metrics
13
+ # Kotoshu::Metrics.enable
14
+ # Kotoshu.correct?("hello")
15
+ # Kotoshu::Metrics.stats
16
+ # # => { lookups: 1, cache_hits: 0, cache_misses: 1, ... }
17
module Metrics
  class << self
    # The collector that receives recorded events.
    #
    # @return [Collector, nil] The collector instance (nil until enabled,
    #   and again after {disable})
    attr_reader :collector

    # Switch metrics collection on, starting from a brand-new collector.
    #
    # @return [Collector] The freshly created collector
    def enable
      @enabled = true
      @collector = Collector.new
    end

    # Switch metrics collection off and discard the current collector.
    #
    # @return [nil]
    def disable
      @enabled = false
      @collector = nil
    end

    # Tell whether metrics are currently being collected.
    #
    # @return [Boolean] True if enabled
    def enabled?
      @enabled ||= false
    end

    # Forward a lookup event to the collector (no-op while disabled).
    #
    # @param word [String] The word being looked up
    # @param result [Boolean] The lookup result
    # @param time [Float] Time taken in milliseconds
    def record_lookup(word, result:, time:)
      active_collector&.record_lookup(word, result: result, time: time)
    end

    # Forward a cache event to the collector (no-op while disabled).
    #
    # @param cache_type [String] Type of cache (lookup, suggestion)
    # @param hit [Boolean] True if cache hit
    def record_cache(cache_type, hit:)
      active_collector&.record_cache(cache_type, hit: hit)
    end

    # Forward a suggestion-generation event to the collector
    # (no-op while disabled).
    #
    # @param word [String] The input word
    # @param count [Integer] Number of suggestions generated
    # @param time [Float] Time taken in milliseconds
    def record_suggestions(word, count:, time:)
      active_collector&.record_suggestions(word, count: count, time: time)
    end

    # Snapshot of the collected statistics.
    #
    # @return [Hash] Current statistics, or an empty hash while disabled
    def stats
      active_collector&.stats || {}
    end

    # Ask the collector to clear everything it has gathered
    # (no-op while disabled).
    def reset
      active_collector&.reset
    end

    # Render the metrics in StatsD format.
    #
    # @return [String] StatsD protocol lines, or "" while disabled
    def to_statsd
      active_collector&.to_statsd || ""
    end

    # Render the metrics in Prometheus format.
    #
    # @return [String] Prometheus exposition format, or "" while disabled
    def to_prometheus
      active_collector&.to_prometheus || ""
    end

    private

    # The collector, but only while collection is switched on.
    #
    # @return [Collector, nil]
    def active_collector
      collector if enabled?
    end
  end
end
110
+ end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Models
5
+ # Value object for text context around an error.
6
+ #
7
+ # Provides the surrounding text before, current, and after
8
+ # an error location for context display and analysis.
9
+ #
10
+ # @example Creating context
11
+ # context = Context.new(
12
+ # before: "The quick brown",
13
+ # current: "fox",
14
+ # after: "jumps over",
15
+ # location: Location.new(line: 5, column: 16)
16
+ # )
17
+ # context.full_context # => "The quick brown\nfox\njumps over"
18
class Context
  attr_reader :before, :current, :after, :location, :window, :full_context

  # Create a new, frozen context object.
  #
  # `full_context` is computed once here by joining the non-nil parts of
  # before/current/after with newlines.
  #
  # @param before [String, nil] Text before the error
  # @param current [String, nil] The current line/text containing the error
  # @param after [String, nil] Text after the error
  # @param location [Documents::Location, nil] The error location
  # @param window [Integer] Window size used for context (default: 5)
  def initialize(before:, current:, after:, location:, window: 5)
    @before = before
    @current = current
    @after = after
    @location = location
    @window = window
    @full_context = [before, current, after].compact.join("\n")
    freeze
  end

  # Get the words surrounding the error location within `current`.
  #
  # The target word is located by position (the location's column), so a
  # word that occurs several times on the line is resolved to the right
  # occurrence. When no word can be pinned down (no location, no column,
  # column out of range or on whitespace) every word of `current` is
  # returned.
  #
  # @param n [Integer] Number of words on each side (default: 3)
  # @return [Array<String>] Surrounding words, including the target word
  def surrounding_words(n = 3)
    return [] if @current.nil? || @current.empty?

    spans = word_spans
    return [] if spans.empty?

    words = spans.map(&:first)
    idx = span_index_at_location(spans)
    return words unless idx

    first = [0, idx - n].max
    last = [words.size - 1, idx + n].min
    words[first..last]
  end

  # Get the word at the error location.
  #
  # The location's column is treated as a zero-based character offset into
  # `current`; the whitespace-delimited word covering that offset is
  # returned. If the offset does not fall on a word (or the location has no
  # column, as for node-based locations) the whole `current` text is
  # returned instead.
  #
  # @return [String, nil] The word at the error location, or nil when there
  #   is no location
  def word_at_location
    return nil unless @location

    if @current
      spans = word_spans
      idx = span_index_at_location(spans)
      return spans[idx].first if idx
    end

    # For node-based locations, return the current text
    @current
  end

  # Check if this context equals another.
  #
  # @param other [Object] Another object
  # @return [Boolean] True if location and full context both match
  def ==(other)
    return false unless other.is_a?(Context)

    @location == other.location && @full_context == other.full_context
  end
  alias_method :eql?, :==

  # Hash code for hash table usage (consistent with #==).
  #
  # @return [Integer] Hash code
  def hash
    [@location, @full_context].hash
  end

  # String representation.
  #
  # @return [String] The full context, prefixed with "Line N: " when the
  #   location provides a line number
  def to_s
    # Safe navigation: a nil location is allowed elsewhere in this class.
    line = @location&.line
    line ? "Line #{line}: #{@full_context}" : @full_context
  end
  alias_method :inspect, :to_s

  # Get context as a formatted string with error highlighting.
  #
  # @param error_word [String, nil] The error word to highlight
  # @return [String] Full context with every occurrence of the word wrapped
  #   in ANSI underline codes; returned untouched for a nil or empty word
  def with_highlight(error_word)
    return @full_context unless error_word
    # An empty pattern would match between every character.
    return @full_context if error_word == ""

    @full_context.gsub(error_word) { |m| "\033[4m#{m}\033[0m" }
  end

  private

  # Split `current` into whitespace-delimited words with their offsets.
  #
  # @return [Array<Array(String, Integer, Integer)>] [word, start, stop)
  #   triples in order of appearance
  def word_spans
    spans = []
    pos = 0
    while (match = /\S+/.match(@current, pos))
      spans << [match[0], match.begin(0), match.end(0)]
      pos = match.end(0)
    end
    spans
  end

  # Find which word span covers the location's column.
  #
  # @param spans [Array] Result of #word_spans
  # @return [Integer, nil] Index into spans, or nil when nothing matches
  def span_index_at_location(spans)
    column = @location&.column
    return nil unless column

    spans.index { |_word, start, stop| column >= start && column < stop }
  end
end
118
+ end
119
+ end
@@ -0,0 +1,182 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Models
5
+ # Abstract base class for word embedding models.
6
+ #
7
+ # Provides a unified interface for loading and querying word embeddings
8
+ # from different sources (FastText, Word2Vec, GloVe, ONNX, etc.).
9
+ #
10
+ # @example Using an embedding model
11
+ # model = FastTextModel.new('cc.en.300.vec')
12
+ # embedding = model.embedding_for('hello')
13
+ # similarity = model.similarity('hello', 'world')
14
+ # neighbors = model.nearest_neighbors('hello', k: 10)
15
+ #
16
+ # @abstract Subclasses must implement {#embedding_for} and {#vocabulary}
17
+ class EmbeddingModel
18
+ attr_reader :language_code, :dimension, :vocabulary_size
19
+
20
+ # Create a new embedding model.
21
+ #
22
+ # @param language_code [String] ISO 639-1 language code
23
+ # @param dimension [Integer] Vector dimensionality (e.g., 300)
24
+ def initialize(language_code:, dimension:)
25
+ raise ArgumentError, "Language code cannot be nil" if language_code.nil?
26
+ raise ArgumentError, "Dimension must be positive" unless dimension&.positive?
27
+
28
+ @language_code = language_code
29
+ @dimension = dimension
30
+ @vocabulary_size = 0
31
+ freeze
32
+ end
33
+
34
+ # Get embedding vector for a word.
35
+ #
36
+ # @param word [String] The word to lookup
37
+ # @return [WordEmbedding, nil] Embedding vector or nil if not found
38
+ # @abstract Subclass must implement
39
+ def embedding_for(word)
40
+ raise NotImplementedError, "#{self.class} must implement #embedding_for"
41
+ end
42
+
43
+ # Check if a word is in the vocabulary.
44
+ #
45
+ # @param word [String] The word to check
46
+ # @return [Boolean] True if word exists in vocabulary
47
+ def has_word?(word)
48
+ vocabulary.include?(word)
49
+ end
50
+
51
+ # Calculate cosine similarity between two words.
52
+ #
53
+ # @param word1 [String] First word
54
+ # @param word2 [String] Second word
55
+ # @return [Float, nil] Similarity score (0.0 to 1.0) or nil if words not found
56
+ def similarity(word1, word2)
57
+ emb1 = embedding_for(word1)
58
+ emb2 = embedding_for(word2)
59
+
60
+ return nil unless emb1 && emb2
61
+
62
+ emb1.similarity(emb2)
63
+ end
64
+
65
+ # Calculate Euclidean distance between two words.
66
+ #
67
+ # @param word1 [String] First word
68
+ # @param word2 [String] Second word
69
+ # @return [Float, nil] Distance or nil if words not found
70
+ def distance(word1, word2)
71
+ emb1 = embedding_for(word1)
72
+ emb2 = embedding_for(word2)
73
+
74
+ return nil unless emb1 && emb2
75
+
76
+ emb1.distance(emb2)
77
+ end
78
+
79
+ # Find k nearest neighbors for a word.
80
+ #
81
+ # @param word [String] The query word
82
+ # @param k [Integer] Number of neighbors to return
83
+ # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
84
+ def nearest_neighbors(word, k: 10)
85
+ embedding = embedding_for(word)
86
+ return [] unless embedding
87
+
88
+ # Calculate similarity with all words in vocabulary
89
+ neighbors = vocabulary.map do |vocab_word|
90
+ next if vocab_word == word
91
+
92
+ vocab_embedding = embedding_for(vocab_word)
93
+ next unless vocab_embedding
94
+
95
+ sim = embedding.similarity(vocab_embedding)
96
+ NearestNeighbor.new(
97
+ word: vocab_word,
98
+ similarity: sim,
99
+ distance: embedding.distance(vocab_embedding),
100
+ embedding: vocab_embedding
101
+ )
102
+ end.compact
103
+
104
+ # Sort by similarity (descending) and take top k
105
+ neighbors.sort.reverse.first(k)
106
+ end
107
+
108
+ # Find k nearest neighbors for an embedding vector.
109
+ #
110
+ # @param embedding [WordEmbedding] The query embedding
111
+ # @param k [Integer] Number of neighbors to return
112
+ # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
113
+ def nearest_neighbors_for_embedding(embedding, k: 10)
114
+ return [] unless embedding
115
+
116
+ # Calculate similarity with all words in vocabulary
117
+ neighbors = vocabulary.map do |vocab_word|
118
+ vocab_embedding = embedding_for(vocab_word)
119
+ next unless vocab_embedding
120
+
121
+ sim = embedding.similarity(vocab_embedding)
122
+ NearestNeighbor.new(
123
+ word: vocab_word,
124
+ similarity: sim,
125
+ distance: embedding.distance(vocab_embedding),
126
+ embedding: vocab_embedding
127
+ )
128
+ end.compact
129
+
130
+ # Sort by similarity (descending) and take top k
131
+ neighbors.sort.reverse.first(k)
132
+ end
133
+
134
+ # Get model metadata.
135
+ #
136
+ # @return [Hash] Model metadata
137
+ def metadata
138
+ {
139
+ language_code: @language_code,
140
+ dimension: @dimension,
141
+ vocabulary_size: @vocabulary_size,
142
+ model_type: self.class.name
143
+ }
144
+ end
145
+
146
+ # Get the vocabulary (all words in the model).
147
+ #
148
+ # @return [Array<String>] Vocabulary words
149
+ # @abstract Subclass must implement
150
+ def vocabulary
151
+ raise NotImplementedError, "#{self.class} must implement #vocabulary"
152
+ end
153
+
154
+ # Check if model is loaded.
155
+ #
156
+ # @return [Boolean] True if model is loaded and ready
157
+ def loaded?
158
+ @vocabulary_size&.positive? || vocabulary&.any?
159
+ end
160
+
161
+ # Get model statistics.
162
+ #
163
+ # @return [Hash] Statistics about the model
164
+ def statistics
165
+ {
166
+ language: @language_code,
167
+ dimension: @dimension,
168
+ vocabulary_size: @vocabulary_size,
169
+ loaded: loaded?
170
+ }
171
+ end
172
+
173
+ # String representation.
174
+ #
175
+ # @return [String] Human-readable representation
176
+ def to_s
177
+ "#{self.class.name}(language: #{@language_code}, dim: #{@dimension}, vocab: #{@vocabulary_size})"
178
+ end
179
+ alias_method :inspect, :to_s
180
+ end
181
+ end
182
+ end
@@ -0,0 +1,220 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'embedding_model'
4
+ require_relative 'word_embedding'
5
+ require_relative 'nearest_neighbor'
6
+
7
+ module Kotoshu
8
+ module Models
9
+ # FastText embedding model implementation.
10
+ #
11
+ # Loads FastText pre-trained word vectors from .vec files.
12
+ # Supports Common Crawl and Wikipedia trained vectors.
13
+ #
14
+ # @example Loading from file
15
+ # model = FastTextModel.from_file('cc.en.300.vec')
16
+ # model.embedding_for('hello')
17
+ #
18
+ # @example Loading from GitHub
19
+ # model = FastTextModel.from_github('en')
20
+ # model.nearest_neighbors('hello', k: 10)
21
+ #
22
+ # @see https://fasttext.cc/docs/en/crawl-vectors.html FastText crawl vectors
23
+ # @see https://fasttext.cc/docs/en/english-vectors.html FastText English vectors
24
class FastTextModel < EmbeddingModel
  # Standard FastText dimension for crawl vectors
  DEFAULT_DIMENSION = 300

  # Number of vectors to load when reading from file
  # FastText .vec files contain up to 2M words; we load a subset by default
  DEFAULT_MAX_VECTORS = 1_000_000

  attr_reader :embeddings, :max_vectors

  # Create a new FastText model.
  #
  # The given embeddings hash is frozen in place (not copied).
  #
  # @param language_code [String] ISO 639-1 language code
  # @param dimension [Integer] Vector dimension (default: 300)
  # @param embeddings [Hash<String, WordEmbedding>] Pre-loaded embeddings
  # @param max_vectors [Integer] Maximum vectors to load from file
  def initialize(language_code:, dimension: DEFAULT_DIMENSION, embeddings: {}, max_vectors: DEFAULT_MAX_VECTORS)
    super(language_code: language_code, dimension: dimension)
    @embeddings = embeddings.freeze
    @max_vectors = max_vectors
    @vocabulary_size = @embeddings.size
  end

  # Load FastText model from a .vec file.
  #
  # The first line of the file is the header ("vocab_size dimension");
  # every following line is "word v1 v2 ... vN". Blank lines and lines
  # whose vector length differs from the header dimension are skipped.
  #
  # @param file_path [String] Path to FastText .vec file
  # @param max_vectors [Integer] Maximum vectors to load (default: 1M)
  # @param language_code [String] Language code (auto-detected from filename)
  # @return [FastTextModel] Loaded model
  # @raise [ArgumentError] if the file doesn't exist, is empty, or has a
  #   header without a positive dimension
  def self.from_file(file_path, max_vectors: DEFAULT_MAX_VECTORS, language_code: nil)
    raise ArgumentError, "File not found: #{file_path}" unless File.exist?(file_path)

    # Detect language from filename if not provided
    language_code ||= detect_language_from_path(file_path)

    embeddings = {}
    dimension = nil
    count = 0

    File.open(file_path, 'r', encoding: 'UTF-8') do |file|
      # First line: vocab_size dimension
      header = file.gets
      raise ArgumentError, "Empty FastText file: #{file_path}" if header.nil?

      dimension = header.split[1].to_i
      unless dimension.positive?
        raise ArgumentError, "Invalid FastText header in #{file_path}: #{header.strip.inspect}"
      end

      # Read vectors
      file.each_line do |line|
        break if count >= max_vectors

        word, *values = line.split
        # word is nil for blank lines; a wrong-sized vector means a malformed row
        next if word.nil? || values.size != dimension

        embeddings[word] = WordEmbedding.new(word, values.map(&:to_f), language_code, dimension: dimension)
        count += 1
      end
    end

    new(language_code: language_code, dimension: dimension, embeddings: embeddings, max_vectors: max_vectors)
  end

  # Load FastText model from GitHub (via ModelCache).
  #
  # Asks the cache for the local path of the language's .vec file and
  # loads it with {from_file}.
  #
  # @param language_code [String] ISO 639-1 language code (de, en, es, fr, pt, ru)
  # @param max_vectors [Integer] Maximum vectors to load (default: 500K for GitHub)
  # @param cache [ModelCache, nil] Optional cache instance
  # @return [FastTextModel] Loaded model
  # @raise [ArgumentError] if language not supported
  def self.from_github(language_code, max_vectors: 500_000, cache: nil)
    # Loaded lazily so the cache code is only required when actually used.
    require_relative '../cache/model_cache'

    cache ||= Cache::ModelCache.new

    # Get the .vec file path from cache
    vec_file = cache.get_fasttext_model(language_code)

    from_file(vec_file, max_vectors: max_vectors, language_code: language_code)
  end

  # Detect language code from a file path.
  #
  # Only the file name is inspected, so dotted directory names cannot be
  # mistaken for a language code. A bare `private` does not apply to
  # `def self.` methods; this one is left publicly callable so existing
  # callers keep working.
  #
  # @param path [String] File path
  # @return [String] Detected language code ('en' when none is found)
  def self.detect_language_from_path(path)
    # Extract from names like "cc.en.300.vec" or "wiki.de.vec"
    if File.basename(path.to_s) =~ /\.([a-z]{2})\./i
      Regexp.last_match(1).downcase
    else
      'en' # Default to English
    end
  end

  # Get embedding vector for a word.
  #
  # @param word [String] The word to lookup
  # @return [WordEmbedding, nil] Embedding vector or nil if not found
  def embedding_for(word)
    return nil if word.nil? || word.empty?

    # Direct lookup
    @embeddings[word]
  end

  # Get the vocabulary (all words in the model).
  #
  # @return [Array<String>] Vocabulary words
  def vocabulary
    @embeddings.keys
  end

  # Check if model is loaded.
  #
  # @return [Boolean] True if embeddings are loaded
  def loaded?
    @embeddings&.any?
  end

  # Find k nearest neighbors for a word (optimized version).
  #
  # Overrides the base implementation to iterate the pre-loaded
  # embeddings directly instead of doing one lookup per vocabulary word.
  # The query word itself is excluded from the result.
  #
  # @param word [String] The query word
  # @param k [Integer] Number of neighbors to return
  # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
  def nearest_neighbors(word, k: 10)
    embedding = embedding_for(word)
    return [] unless embedding

    rank_loaded_embeddings(embedding, k, skip: word)
  end

  # Find k nearest neighbors for an embedding vector (optimized version).
  #
  # @param embedding [WordEmbedding] The query embedding
  # @param k [Integer] Number of neighbors to return
  # @return [Array<NearestNeighbor>] Nearest neighbors sorted by similarity
  def nearest_neighbors_for_embedding(embedding, k: 10)
    return [] unless embedding

    rank_loaded_embeddings(embedding, k)
  end

  # Get batch embeddings for multiple words.
  #
  # @param words [Array<String>] Words to lookup
  # @return [Hash<String, WordEmbedding>] Mapping of word to embedding
  #   (words without an embedding are omitted)
  def batch_embeddings(words)
    words.each_with_object({}) do |word, found|
      emb = embedding_for(word)
      found[word] = emb if emb
    end
  end

  # Get batch similarities for word pairs.
  #
  # @param pairs [Array<Array<String, String>>] Word pairs
  # @return [Array<Float, nil>] Similarity scores (nil where a word is unknown)
  def batch_similarities(pairs)
    pairs.map { |word1, word2| similarity(word1, word2) }
  end

  private

  # Score every loaded embedding against a query and keep the k most
  # similar ones.
  #
  # @param query [WordEmbedding] The query embedding
  # @param k [Integer] Number of neighbors to keep
  # @param skip [String, nil] A word to leave out (the query word itself)
  # @return [Array<NearestNeighbor>] Neighbors sorted by similarity (descending)
  def rank_loaded_embeddings(query, k, skip: nil)
    scored = @embeddings.map do |vocab_word, vocab_embedding|
      next if skip && vocab_word == skip

      NearestNeighbor.new(
        word: vocab_word,
        similarity: query.similarity(vocab_embedding),
        embedding: vocab_embedding
      )
    end.compact

    # Sort by similarity (descending) and take top k
    scored.sort.reverse.first(k)
  end
end
219
+ end
220
+ end