kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,165 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  # Base error class for all Kotoshu exceptions.
  #
  # Rescue this class to catch every error raised by the library.
  #
  # @example Raising a generic Kotoshu error
  #   raise Kotoshu::Error, "Something went wrong"
  class Error < StandardError; end

  # Error raised when a dictionary file cannot be found.
  #
  # The first constructor argument is the *path*, not the message; the
  # message is derived from the path unless one is given explicitly.
  #
  # @example Dictionary not found
  #   raise DictionaryNotFoundError, "/path/to/en_US.dic"
  #   # message => "Dictionary not found: /path/to/en_US.dic"
  class DictionaryNotFoundError < Error
    # @return [String] The path that was not found
    attr_reader :path

    # Create a new dictionary not found error.
    #
    # @param path [String] The path that was not found
    # @param message [String, nil] Custom message (optional)
    def initialize(path, message = nil)
      @path = path
      super(message || "Dictionary not found: #{path}")
    end
  end

  # Error raised when a dictionary file has an invalid format.
  #
  # @example Invalid dictionary format
  #   raise InvalidDictionaryFormatError.new("/path/to/en_US.dic", "missing word count")
  #   # message => "Invalid dictionary format: missing word count: /path/to/en_US.dic"
  class InvalidDictionaryFormatError < Error
    # @return [String] The file path
    attr_reader :path

    # @return [String, nil] Details about the format issue
    attr_reader :details

    # Create a new invalid format error.
    #
    # @param path [String] The file path
    # @param details [String, nil] Details about the format issue
    def initialize(path, details = nil)
      @path = path
      @details = details
      # The details segment is only present when details were supplied.
      detail_part = details ? ": #{details}" : ""
      super("Invalid dictionary format#{detail_part}: #{path}")
    end
  end

  # Error raised when there is a configuration issue.
  #
  # @example Invalid configuration
  #   raise ConfigurationError.new("Invalid dictionary type: unknown_type", key: :type)
  class ConfigurationError < Error
    # @return [String, Symbol, nil] The configuration key
    attr_reader :key

    # Create a new configuration error.
    #
    # @param message [String] The error message
    # @param key [String, Symbol, nil] The configuration key (optional)
    def initialize(message, key: nil)
      @key = key
      super(message)
    end
  end

  # Error raised during spell checking operations.
  #
  # @example Spell check failure
  #   raise SpellcheckError.new("Failed to check word: encoding error", word: "caf\xE9")
  class SpellcheckError < Error
    # @return [String, nil] The word being checked
    attr_reader :word

    # Create a new spellcheck error.
    #
    # @param message [String] The error message
    # @param word [String, nil] The word being checked (optional)
    def initialize(message, word: nil)
      @word = word
      super(message)
    end
  end

  # Error raised when an affix rule cannot be parsed.
  #
  # @example Invalid affix rule
  #   raise AffixRuleError.new("Invalid affix rule", rule: "PFX A Y 1 re")
  class AffixRuleError < Error
    # @return [String, nil] The rule that failed to parse
    attr_reader :rule

    # Create a new affix rule error.
    #
    # @param message [String] The error message
    # @param rule [String, nil] The rule that failed to parse (optional)
    def initialize(message, rule: nil)
      @rule = rule
      super(message)
    end
  end

  # Error raised when a required resource is not cached and cannot be
  # downloaded (offline mode or network failure).
  class ResourceNotCachedError < Error
    # @return [Object] The language the resource belongs to
    # @return [Object] The kind of resource that is missing
    attr_reader :language, :resource_type

    # @param language [String, Symbol] Language whose resource is missing
    # @param resource_type [String, Symbol] Kind of resource that is missing
    def initialize(language, resource_type)
      @language = language
      @resource_type = resource_type
      super("Resource not cached: #{language}:#{resource_type}. " \
            "Pre-fetch with `kotoshu cache download language #{language}` " \
            "or disable offline mode (KOTOSHU_OFFLINE=0).")
    end
  end

  # Error raised by the hot path (Kotoshu.correct?, .suggest, .check,
  # .check_file, .spellchecker_for) when a language hasn't been set up
  # via Kotoshu.setup / kotoshu setup. The hot path is cache-only and
  # never downloads — explicit setup is required.
  class ResourceNotSetupError < Error
    attr_reader :language, :resource_type

    # @param language [String, Symbol] Language that has not been set up
    # @param resource_type [String, Symbol] Missing resource kind
    #   (defaults to "spelling")
    def initialize(language, resource_type = "spelling")
      @language = language
      @resource_type = resource_type
      super("Language '#{language}' is not set up (missing #{resource_type}). " \
            "Run `kotoshu setup #{language}` or " \
            "`Kotoshu.setup(:#{language})` first.")
    end
  end

  # Error raised when a resource cannot be resolved for a language
  # (unsupported language, download failure, etc.).
  class ResourceResolutionError < Error
    # @return [Object] The language that could not be resolved
    attr_reader :language

    # @return [Object] Why resolution failed (also embedded in the message)
    attr_reader :reason

    # @param language [String, Symbol] Language that could not be resolved
    # @param reason [String] Human-readable cause of the failure
    def initialize(language, reason)
      @language = language
      @reason = reason
      super("Cannot resolve resources for '#{language}': #{reason}")
    end
  end

  # Error raised when a downloaded resource fails integrity verification
  # (SHA-256 mismatch against manifest, truncated content, etc.).
  #
  # The downloaded bytes are never trusted until verified against a known
  # manifest entry. Mismatch raises this error with both hashes so the
  # caller can surface them in audit logs and CI output.
  class IntegrityError < Error
    attr_reader :resource_id, :expected, :actual, :url

    # @param resource_id [String] Identifier of the resource that failed
    # @param expected [String] Expected SHA-256 digest
    # @param actual [String] Digest computed from the downloaded bytes
    # @param url [String, nil] Source URL, appended to the message when given
    def initialize(resource_id, expected:, actual:, url: nil)
      @resource_id = resource_id
      @expected = expected
      @actual = actual
      @url = url
      msg = "Integrity verification failed for #{resource_id}: " \
            "expected sha256=#{expected}, got sha256=#{actual}"
      msg += " (url: #{url})" if url
      super(msg)
    end
  end
end
@@ -0,0 +1,291 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Core
    # Indexed dictionary for efficient word lookup with multiple indexes.
    #
    # Words are stored in insertion order as entry hashes
    # (+{ word:, index:, metadata: }+) and indexed four ways:
    # - exact (case-sensitive) word => entry positions
    # - lowercase word => entry positions
    # - prefix => words (every prefix of a word, including the word itself)
    # - suffix => words (every suffix of a word, including the word itself)
    #
    # A +:flag+ index slot is reserved but not populated by this class.
    # Duplicate words are stored (and indexed) once per insertion.
    class IndexedDictionary
      # @return [Array<Hash>] Entries in insertion order
      # @return [Integer] Number of entries (duplicates included)
      attr_reader :words, :size

      # @param words [Array<String>] Initial words to add
      def initialize(words = [])
        @words = []
        @indexes = {
          exact: {},     # word => [positions]
          lowercase: {}, # word.downcase => [positions]
          prefix: {},    # prefix => [words]
          suffix: {},    # suffix => [words]
          flag: {}       # reserved; not populated here
        }
        @size = 0

        words.each { |word| add_word(word) }
      end

      # Add a word to the dictionary with optional metadata.
      #
      # @param word [String] The word to add
      # @param metadata [Hash] Optional metadata associated with the word
      # @return [IndexedDictionary] Self for chaining
      def add_word(word, metadata = {})
        position = @size
        @words << { word: word, index: position, metadata: metadata }
        @size += 1

        (@indexes[:exact][word] ||= []) << position
        (@indexes[:lowercase][word.downcase] ||= []) << position

        # Index every prefix/suffix length from 1 up to the full word, so a
        # word is found when queried by itself and single-character words are
        # indexed too. This keeps the indexed lookups consistent with the
        # start_with?/end_with? scans used by the ignore_case paths.
        (1..word.length).each do |len|
          (@indexes[:prefix][word[0, len]] ||= []) << word
          (@indexes[:suffix][word[-len, len]] ||= []) << word
        end

        self
      end
      alias << add_word

      # Add multiple words.
      #
      # @param new_words [Array<String>] Words to add
      # @return [IndexedDictionary] Self for chaining
      def add_words(new_words)
        new_words.each { |word| add_word(word) }
        self
      end

      # Check if a word exists (case-sensitive).
      #
      # @param word [String] The word to check
      # @return [Boolean] True if word exists
      def has_word?(word)
        @indexes[:exact].key?(word)
      end
      alias include? has_word?
      alias contains? has_word?

      # Check if a word exists (case-insensitive).
      #
      # @param word [String] The word to check
      # @return [Boolean] True if word exists (any case)
      def has_word_ignorecase?(word)
        @indexes[:lowercase].key?(word.downcase)
      end

      # Look up a word (case-sensitive).
      #
      # @param word [String] The word to look up
      # @return [Hash, nil] First entry added for the word, or nil
      def lookup(word)
        first_entry_for(@indexes[:exact][word])
      end

      # Look up a word (case-insensitive).
      #
      # @param word [String] The word to look up
      # @return [Hash, nil] First entry whose lowercase form matches, or nil
      def lookup_ignorecase(word)
        first_entry_for(@indexes[:lowercase][word.downcase])
      end

      # Find all words with a given prefix.
      #
      # The case-sensitive path reads the prefix index, which has no entry for
      # the empty string, so an empty prefix yields +[]+ there.
      #
      # @param prefix [String] The prefix to match
      # @param ignore_case [Boolean] Whether to ignore case
      # @return [Array<String>] Words with the prefix, in insertion order
      def find_by_prefix(prefix, ignore_case: false)
        return @indexes[:prefix].fetch(prefix, []).dup unless ignore_case

        needle = prefix.downcase
        all_words.select { |w| w.downcase.start_with?(needle) }
      end

      # Find all words with a given suffix.
      #
      # The case-sensitive path reads the suffix index, which has no entry for
      # the empty string, so an empty suffix yields +[]+ there.
      #
      # @param suffix [String] The suffix to match
      # @param ignore_case [Boolean] Whether to ignore case
      # @return [Array<String>] Words with the suffix, in insertion order
      def find_by_suffix(suffix, ignore_case: false)
        return @indexes[:suffix].fetch(suffix, []).dup unless ignore_case

        needle = suffix.downcase
        all_words.select { |w| w.downcase.end_with?(needle) }
      end

      # Find words matching a pattern.
      #
      # @param pattern [Regexp] The pattern to match
      # @return [Array<String>] Matching words
      def find_by_pattern(pattern)
        all_words.select { |w| w.match?(pattern) }
      end

      # Find words of a specific length.
      #
      # @param length [Integer] The exact length
      # @return [Array<String>] Words of the given length
      def find_by_length(length)
        all_words.select { |w| w.length == length }
      end

      # Find words within a length range (inclusive on both ends).
      #
      # @param min_length [Integer] Minimum length
      # @param max_length [Integer] Maximum length
      # @return [Array<String>] Words within the length range
      def find_by_length_range(min_length:, max_length:)
        all_words.select { |w| w.length.between?(min_length, max_length) }
      end

      # Get all words in the dictionary.
      #
      # @return [Array<String>] All words, in insertion order (new array)
      def all_words
        @words.map { |entry| entry[:word] }
      end

      # Get random words from the dictionary.
      #
      # @param count [Integer] Number of random words
      # @return [Array<String>] Up to +count+ distinct entries' words
      def random_words(count: 1)
        return [] if @words.empty?

        @words.sample(count).map { |entry| entry[:word] }
      end

      # Count words by their upcased first character.
      #
      # @return [Hash] Hash of first character (upcased) => word count
      def count_by_first_letter
        all_words.each_with_object(Hash.new(0)) do |word, counts|
          next if word.empty?

          counts[word[0].upcase] += 1
        end
      end

      # Get word length distribution.
      #
      # @return [Hash] Hash of length => count
      def count_by_length
        all_words.each_with_object(Hash.new(0)) { |word, counts| counts[word.length] += 1 }
      end

      # Check if the dictionary is empty.
      #
      # @return [Boolean] True if empty
      def empty?
        @size.zero?
      end

      # Iterate over all words.
      #
      # @yield [word] Each word
      # @return [Enumerator] Enumerator if no block given
      def each_word
        return enum_for(:each_word) unless block_given?

        @words.each { |entry| yield entry[:word] }
      end

      # Iterate over all words with indices.
      #
      # @yield [word, index] Each word and its index
      # @return [Enumerator] Enumerator if no block given
      def each_with_index
        return enum_for(:each_with_index) unless block_given?

        @words.each { |entry| yield entry[:word], entry[:index] }
      end

      # Build a Trie from the dictionary words.
      #
      # The trie files are required lazily, only when this method is called.
      #
      # @return [Trie] New trie containing all words
      def to_trie
        require_relative "trie/trie"
        require_relative "trie/builder"

        Trie::Builder.from_array(all_words)
      end

      # Get statistics about the dictionary.
      #
      # @return [Hash] Statistics
      def statistics
        words = all_words
        lengths = words.map(&:length)

        {
          total_words: @size,
          unique_words: words.uniq.size,
          min_length: lengths.min || 0,
          max_length: lengths.max || 0,
          avg_length: lengths.empty? ? 0 : (lengths.sum.to_f / lengths.size).round(2),
          count_by_first_letter: count_by_first_letter,
          count_by_length: count_by_length
        }
      end

      # Convert to string.
      #
      # @return [String] String representation
      def to_s
        "IndexedDictionary(size: #{@size})"
      end
      alias inspect to_s

      # Create indexed dictionary from a file.
      #
      # Reads one word per line; empty lines and lines starting with "#"
      # are skipped.
      #
      # @param path [String] Path to word list file
      # @return [IndexedDictionary] New dictionary
      def self.from_file(path)
        words = File.foreach(path, chomp: true).reject { |l| l.empty? || l.start_with?("#") }
        new(words)
      end

      # Create indexed dictionary from a Trie.
      #
      # @param trie [#all_words] The trie to convert
      # @return [IndexedDictionary] New dictionary
      def self.from_trie(trie)
        new(trie.all_words)
      end

      private

      # Resolve a list of entry positions to the first stored entry.
      #
      # @param positions [Array<Integer>, nil] Positions from an index
      # @return [Hash, nil] The entry at the first position, or nil
      def first_entry_for(positions)
        return nil if positions.nil? || positions.empty?

        @words[positions.first]
      end
    end
  end
end
@@ -0,0 +1,260 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Models
    # Affix rule model for Hunspell-style affix processing.
    #
    # Affix rules define how prefixes and suffixes can be added or removed
    # from words to generate morphological variants.
    #
    # This is a value object that represents a single affix rule.
    #
    # Condition strings use the Hunspell condition syntax: "." matches any
    # character, "[abc]" matches one character from the set, "[^abc]" matches
    # one character not in the set, and anything else matches literally. The
    # condition is tested against the stem: at its end for suffix rules, at
    # its beginning for prefix rules.
    #
    # @note This class is immutable and frozen on initialization.
    #
    # @example Creating a prefix rule
    #   rule = Models::AffixRule.new(
    #     type: :prefix,
    #     flag: "A",
    #     strip: "",
    #     add: "re",
    #     condition: "."
    #   )
    #   rule.prefix?      # => true
    #   rule.suffix?      # => false
    #   rule.apply("do")  # => "redo"
    class AffixRule
      # Affix rule types.
      TYPES = {
        prefix: "PFX",
        suffix: "SFX"
      }.freeze

      # Splits a condition string into character classes and single characters.
      CONDITION_TOKEN = /\[\^?[^\]]+\]|./.freeze
      private_constant :CONDITION_TOKEN

      # @return [Symbol] The affix type (:prefix or :suffix)
      attr_reader :type

      # @return [String] The flag character identifying this rule
      attr_reader :flag

      # @return [String] Characters to strip from the word
      attr_reader :strip

      # @return [String] Characters to add to the word
      attr_reader :add

      # @return [Regexp] Compiled condition for applying this rule
      attr_reader :condition

      # @return [Boolean] Whether this is a cross-product rule
      attr_reader :cross_product

      # Create an affix rule from a whitespace-separated affix line of the
      # form +TYPE FLAG CROSS STRIP ADD [CONDITION]+, where CROSS is "Y"/"N"
      # and "0" stands for an empty STRIP or ADD. This is the same layout
      # produced by {#to_hunspell}.
      #
      # @param line [String] The affix line
      # @param type [Symbol] The rule type (:prefix or :suffix)
      # @return [AffixRule, nil] New affix rule, or nil when the line has
      #   fewer than five fields
      #
      # @example Parsing a prefix rule
      #   AffixRule.from_hunspell("PFX A Y 0 re .", :prefix)
      #
      # @example Parsing a suffix rule
      #   AffixRule.from_hunspell("SFX V N e ive e", :suffix)
      def self.from_hunspell(line, type)
        parts = line.split
        return nil if parts.length < 5

        new(
          type: type,
          flag: parts[1],
          strip: parts[3] == "0" ? "" : parts[3],
          add: parts[4] == "0" ? "" : parts[4],
          condition: parts[5] || ".",
          cross_product: parts[2] == "Y"
        )
      end

      # Create a new AffixRule.
      #
      # @param type [Symbol] The affix type (:prefix or :suffix)
      # @param flag [String] The flag character
      # @param strip [String] Characters to strip (nil is treated as "")
      # @param add [String] Characters to add (nil is treated as "")
      # @param condition [String, Regexp] Condition for applying; a Regexp is
      #   used as-is, a String is compiled from Hunspell condition syntax
      # @param cross_product [Boolean] Whether this is cross-product
      # @raise [ArgumentError] If the type is unknown or the flag is empty
      def initialize(type:, flag:, strip:, add:, condition: ".", cross_product: false)
        raise ArgumentError, "Invalid type: #{type}" unless TYPES.key?(type)
        raise ArgumentError, "Flag cannot be empty" if flag.nil? || flag.empty?

        @type = type
        @flag = flag.dup.freeze
        @strip = strip.to_s.dup.freeze
        @add = add.to_s.dup.freeze
        if condition.is_a?(Regexp)
          @condition = condition
          @condition_string = condition.source.dup.freeze
        else
          # Keep the textual form so the rule can be serialized back exactly.
          text = condition.to_s
          @condition_string = (text.empty? ? "." : text).dup.freeze
          @condition = compile_condition(@condition_string)
        end
        @cross_product = cross_product

        freeze
      end

      # Check if this is a prefix rule.
      #
      # @return [Boolean] True if prefix
      def prefix?
        @type == :prefix
      end

      # Check if this is a suffix rule.
      #
      # @return [Boolean] True if suffix
      def suffix?
        @type == :suffix
      end

      # Check if this rule's condition matches a (stem) word.
      #
      # @param word [String] The word to check
      # @return [Boolean] True if the rule applies; false for nil/empty words
      def applies_to?(word)
        return false if word.nil? || word.empty?

        word.match?(@condition)
      end

      # Apply this rule to a word.
      #
      # @param word [String] The stem to modify
      # @return [String, nil] The modified word, or nil if the condition does
      #   not match or the word does not carry the characters to strip
      def apply(word)
        return nil unless applies_to?(word)

        # delete_prefix/delete_suffix are no-ops for an empty strip, unlike a
        # negative-length slice, which would discard the whole stem.
        if prefix?
          word.start_with?(@strip) ? "#{@add}#{word.delete_prefix(@strip)}" : nil
        else
          word.end_with?(@strip) ? "#{word.delete_suffix(@strip)}#{@add}" : nil
        end
      end

      # Remove this affix from a word (reverse operation of {#apply}).
      #
      # The added characters are removed and the stripped characters restored;
      # the condition is then checked against the recovered stem, because
      # conditions describe the stem rather than the affixed form.
      #
      # @param word [String] The affixed word
      # @return [String, nil] The recovered stem, or nil if the affix is not
      #   present or the recovered stem does not satisfy the condition
      def remove(word)
        return nil if word.nil? || word.empty?

        stem =
          if prefix?
            "#{@strip}#{word.delete_prefix(@add)}" if word.start_with?(@add)
          elsif word.end_with?(@add)
            "#{word.delete_suffix(@add)}#{@strip}"
          end

        stem if stem && applies_to?(stem)
      end

      # Get the affix line representation
      # (+TYPE FLAG CROSS STRIP ADD CONDITION+), parseable by
      # {.from_hunspell}. Empty strip/add are written as "0".
      #
      # @return [String] The affix line
      def to_hunspell
        cross = @cross_product ? "Y" : "N"
        strip_field = @strip.empty? ? "0" : @strip
        add_field = @add.empty? ? "0" : @add
        "#{TYPES[@type]} #{@flag} #{cross} #{strip_field} #{add_field} #{condition_to_s}"
      end

      # Convert to hash.
      #
      # @return [Hash] Hash representation; :condition is the condition text
      #   (or the Regexp source when the rule was built from a Regexp)
      def to_h
        {
          type: @type,
          flag: @flag,
          strip: @strip,
          add: @add,
          condition: @condition_string,
          cross_product: @cross_product
        }
      end

      # Check equality based on all attributes.
      #
      # @param other [AffixRule] The other rule
      # @return [Boolean] True if equal
      def ==(other)
        return false unless other.is_a?(AffixRule)

        @type == other.type &&
          @flag == other.flag &&
          @strip == other.strip &&
          @add == other.add &&
          @condition == other.condition &&
          @cross_product == other.cross_product
      end
      alias eql? ==

      # Hash code consistent with {#==} (equal rules hash equally).
      #
      # @return [Integer] Hash code
      def hash
        [@type, @flag, @strip, @add, @cross_product].hash
      end

      # Compare rules by flag.
      #
      # @param other [AffixRule] The other rule
      # @return [Integer, nil] Comparison result, or nil for non-rules
      def <=>(other)
        return nil unless other.is_a?(AffixRule)

        @flag <=> other.flag
      end

      private

      # Compile a Hunspell condition string to an anchored regex.
      #
      # @param condition [String] The condition string
      # @return [Regexp] The compiled regex (matches everything for ".")
      def compile_condition(condition)
        return // if condition == "."

        pattern = condition.scan(CONDITION_TOKEN).map { |token| condition_token_to_regex(token) }.join

        # Suffix conditions describe the end of the stem, prefix conditions
        # its beginning.
        @type == :suffix ? Regexp.new("#{pattern}\\z") : Regexp.new("\\A#{pattern}")
      end

      # Translate one condition token into regex source.
      #
      # @param token [String] A "[...]" class, "." or a single character
      # @return [String] Regex source for the token
      def condition_token_to_regex(token)
        return "." if token == "."
        return Regexp.escape(token) unless token.length > 2 && token.start_with?("[") && token.end_with?("]")

        members = token[1...-1]
        negated = members.start_with?("^")
        members = members[1..] if negated
        # "[^]" has nothing to negate; treat it as literal text.
        return Regexp.escape(token) if members.empty?

        "[#{"^" if negated}#{Regexp.escape(members)}]"
      end

      # Condition in its textual form, as used in affix lines.
      #
      # @return [String] The condition string
      def condition_to_s
        @condition_string
      end
    end
  end
end