kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,135 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
# Pipeline for composable suggestion strategies.
#
# A pipeline is an ordered list of stage names. The strategy objects
# themselves are supplied at execution time, so one pipeline definition can
# be run against different strategy sets.
#
# @example Creating a pipeline
#   pipeline = Pipeline.new do |p|
#     p.add :sym_spell
#     p.add :phonetic
#     p.add :ngram
#   end
#
# @example Executing a pipeline
#   result = pipeline.execute(context, strategies)
class Pipeline
  # @return [Array<Symbol>] Ordered stage names
  attr_reader :stages

  # Create a new, initially empty pipeline.
  #
  # @yield [pipeline] Optional block used to add stages
  # @yieldparam pipeline [Pipeline] The pipeline being built
  #
  # @example With block
  #   pipeline = Pipeline.new do |p|
  #     p.add :sym_spell
  #     p.add :phonetic
  #   end
  def initialize
    @stages = []
    yield self if block_given?
  end

  # Copy hook used by both +dup+ and +clone+.
  #
  # Without it a copy would share the very same +@stages+ array, so adding
  # or removing a stage on the copy would silently change the original.
  #
  # @param source [Pipeline] The pipeline being copied
  # @return [void]
  def initialize_copy(source)
    super
    @stages = source.stages.dup
  end

  # Add a stage to the end of the pipeline.
  #
  # @param stage_name [Symbol] Name of the stage
  # @return [Pipeline] Self for chaining
  #
  # @example
  #   pipeline.add(:sym_spell)
  def add(stage_name)
    @stages << stage_name
    self
  end

  # Remove a stage from the pipeline.
  #
  # Every occurrence of +stage_name+ is removed (Array#delete semantics);
  # removing an unknown stage is a no-op.
  #
  # @param stage_name [Symbol] Name of the stage to remove
  # @return [Pipeline] Self for chaining
  #
  # @example
  #   pipeline.remove(:phonetic)
  def remove(stage_name)
    @stages.delete(stage_name)
    self
  end

  # Execute strategies through the pipeline.
  #
  # Stages run in insertion order. For each stage the strategy is looked up
  # in +strategies+ by stage name; stages without a strategy are skipped.
  # When +strategies+ is not a Hash, that single object is used as the
  # strategy for every stage.
  #
  # An empty stage result does not stop the run unless +early_termination+
  # is enabled, in which case the remaining stages are skipped.
  #
  # @param context [Context] The suggestion context
  # @param strategies [Hash, #generate, nil] stage_name => strategy_instance,
  #   or a single strategy applied to every stage
  # @param early_termination [Boolean] Whether to stop on empty result
  # @return [SuggestionSet] Combined results from all executed stages
  #
  # @example
  #   strategies = { sym_spell: sym_spell_strategy, phonetic: phonetic_strategy }
  #   result = pipeline.execute(context, strategies)
  def execute(context, strategies = nil, early_termination: false)
    combined = SuggestionSet.empty

    @stages.each do |stage_name|
      strategy = strategies.is_a?(Hash) ? strategies[stage_name] : strategies
      next unless strategy

      result = strategy.generate(context)
      combined = combine_results(combined, result)

      # The (empty) result has already been folded in; just stop here.
      break if early_termination && result.empty?
    end

    combined
  end

  # Check if pipeline has a stage.
  #
  # @param stage_name [Symbol] Stage name
  # @return [Boolean] True if stage exists
  def has_stage?(stage_name)
    @stages.include?(stage_name)
  end

  # Clear all stages.
  #
  # @return [Pipeline] Self for chaining
  def clear
    @stages.clear
    self
  end

  private

  # Combine two suggestion sets.
  #
  # Relies on SuggestionSet#concat and SuggestionSet#unique, which are
  # defined elsewhere — NOTE(review): presumably +unique+ de-duplicates by
  # word; confirm against SuggestionSet.
  #
  # @param combined [SuggestionSet] Current combined results
  # @param new_result [SuggestionSet] New results to add
  # @return [SuggestionSet] Combined suggestion set
  def combine_results(combined, new_result)
    combined.concat(new_result.suggestions).unique
  end
end
134
+ end
135
+ end
@@ -0,0 +1,296 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
# Base class for suggestion strategies.
#
# Subclasses must implement the {#generate} method. The base class supplies
# configuration access, suggestion construction helpers and duck-typed
# dictionary access.
#
# @example Implementing a custom strategy
#   class MyStrategy < BaseStrategy
#     def generate(context)
#       # Return suggestions based on context.word
#       SuggestionSet.from_words(%w[word1 word2], source: :my_strategy)
#     end
#   end
class BaseStrategy
  # Longest prefix/suffix (in characters) that earns a similarity bonus.
  SIMILARITY_WINDOW = 4
  # Bonus added per matching leading character.
  PREFIX_WEIGHT = 0.15
  # Bonus added per matching trailing character.
  SUFFIX_WEIGHT = 0.05
  # Penalty subtracted per character of length difference.
  LENGTH_PENALTY = 0.1

  # @return [Symbol] Strategy name
  attr_reader :name

  # @return [Hash] Strategy configuration
  attr_reader :config

  # Create a new base strategy.
  #
  # @param name [String, Symbol] Strategy name (stored as a Symbol)
  # @param config [Hash] Configuration options
  # @option config [Integer] max_results Maximum results to return (default 10)
  # @option config [Boolean] enabled Whether strategy is enabled (default true)
  # @option config [Integer] priority Ordering hint, see {#priority}
  def initialize(name: :base, **config)
    @name = name.to_sym
    @config = config
    @enabled = config.fetch(:enabled, true)
    @max_results = config.fetch(:max_results, 10)
  end

  # Generate suggestions for a word.
  #
  # @abstract Subclasses must implement this method.
  # @param context [Context] The suggestion context
  # @return [SuggestionSet] Generated suggestions
  # @raise [NotImplementedError] Subclass must implement
  def generate(context)
    raise NotImplementedError, "#{self.class} must implement #generate"
  end

  # Check if this strategy is enabled.
  #
  # @return [Boolean] True if enabled
  def enabled?
    @enabled
  end

  # Get the max results configuration.
  #
  # @param default [Integer] Used only when +max_results+ was explicitly
  #   configured as +nil+ (an absent key already defaults to 10)
  # @return [Integer] Max results
  def max_results(default = 10)
    @max_results || default
  end

  # Get a configuration value.
  #
  # @param key [Symbol] The config key
  # @param default [Object] Default value if not set
  # @return [Object] The config value
  def get_config(key, default = nil)
    @config.fetch(key, default)
  end

  # Check if a config value is present.
  #
  # @param key [Symbol] The config key
  # @return [Boolean] True if config has the key
  def has_config?(key)
    @config.key?(key)
  end

  # Get the priority for this strategy.
  #
  # @return [Integer] Priority (lower = higher priority), 100 when unset
  def priority
    @config.fetch(:priority, 100)
  end

  # Check if this strategy should handle the context.
  #
  # Default implementation: an enabled strategy handles every word that the
  # context's dictionary does not contain. Subclasses can override for more
  # specific logic.
  #
  # @param context [Context] The suggestion context
  # @return [Boolean] True if the strategy should handle this context
  def handles?(context)
    return false unless enabled?

    !dictionary_lookup(context, context.word)
  end

  # Create a suggestion from a word, tagged with this strategy's name.
  #
  # @param word [String] The suggested word
  # @param distance [Integer] Edit distance
  # @param confidence [Float] Confidence score
  # @param metadata [Hash] Additional keywords forwarded to Suggestion.new
  # @return [Suggestion] New suggestion
  def create_suggestion(word, distance: 0, confidence: 1.0, **metadata)
    Suggestion.new(
      word: word,
      distance: distance,
      confidence: confidence,
      source: @name,
      **metadata
    )
  end

  # Create a suggestion set from words.
  #
  # @param words [Array<String>] Array of words
  # @param distances [Hash] Optional word => distance mapping; words missing
  #   from it (even after downcasing) get distance 1
  # @param original_word [String, nil] The original misspelled word (for ranking)
  # @return [SuggestionSet] New suggestion set capped at {#max_results}
  def create_suggestion_set(words, distances: {}, original_word: nil)
    suggestions = words.map do |word|
      # Exact-case key first, then the downcased key, then the default of 1.
      distance = distances.fetch(word) { distances.fetch(word.downcase, 1) }
      similarity = original_word ? calculate_ngram_similarity(original_word, word) : 0

      create_suggestion(
        word,
        distance: distance,
        confidence: calculate_confidence(distance),
        original_length: original_word&.length || word.length,
        ngram_score: similarity
      )
    end

    SuggestionSet.new(suggestions, max_size: max_results)
  end

  # Calculate typo correction similarity between two words.
  #
  # Despite the method name this is not an n-gram measure. The score is
  # case-insensitive and combines:
  # - Character overlap: share of +word1+ characters that occur anywhere in
  #   +word2+, relative to the longer word
  # - Prefix bonus: {PREFIX_WEIGHT} per matching leading character
  # - Suffix bonus: {SUFFIX_WEIGHT} per matching trailing character
  #   (both limited to {SIMILARITY_WINDOW} characters)
  # - Length penalty: {LENGTH_PENALTY} per character of length difference
  #
  # @param word1 [String, nil] First word
  # @param word2 [String, nil] Second word
  # @return [Float] Similarity from 0.0 (none, or nil/empty input) to 1.0
  #   (identical ignoring case)
  def calculate_ngram_similarity(word1, word2)
    # Always a Float, as documented (the nil/empty case used to yield Integer 0).
    return 0.0 if word1.nil? || word2.nil? || word1.empty? || word2.empty?

    w1 = word1.downcase
    w2 = word2.downcase
    return 1.0 if w1 == w2

    len1 = w1.length
    len2 = w2.length
    window = [len1, len2, SIMILARITY_WINDOW].min

    prefix_len = (0...window).take_while { |i| w1[i] == w2[i] }.size
    suffix_len = (1..window).take_while { |i| w1[-i] == w2[-i] }.size
    overlap = w1.each_char.count { |char| w2.include?(char) }

    score = overlap.to_f / [len1, len2].max
    score += prefix_len * PREFIX_WEIGHT
    score += suffix_len * SUFFIX_WEIGHT
    score -= (len1 - len2).abs * LENGTH_PENALTY

    score.clamp(0.0, 1.0)
  end

  # Generate n-grams for a word.
  #
  # @param word [String] The word
  # @param n [Integer] N-gram size
  # @return [Set<String>] Set of n-grams (empty when the word is shorter than n)
  def generate_ngrams(word, n)
    Set.new((0..word.length - n).map { |start| word[start, n] })
  end

  # Convert strategy to string.
  #
  # @return [String] String representation
  def to_s
    "#{self.class.name}(name: #{@name}, enabled: #{enabled?})"
  end
  alias inspect to_s

  private

  # Look up a word in the dictionary.
  #
  # The dictionary is duck-typed and probed in this order: +lookup+,
  # +has_word?+, +include?+ (which also covers Array, Set and Hash keys).
  # Anything else, including +nil+, is treated as "word not found".
  #
  # @param context [Context] The suggestion context
  # @param word [String] The word to look up
  # @return [Object] Truthy if the word exists (the raw result of the
  #   dictionary method is returned)
  def dictionary_lookup(context, word)
    dictionary = context.dictionary

    if dictionary.respond_to?(:lookup)
      dictionary.lookup(word)
    elsif dictionary.respond_to?(:has_word?)
      dictionary.has_word?(word)
    elsif dictionary.respond_to?(:include?)
      dictionary.include?(word)
    else
      false
    end
  end

  # Get all words from the dictionary.
  #
  # @param context [Context] The suggestion context
  # @return [Array<String>] All words; empty for unsupported dictionary types
  def dictionary_words(context)
    dictionary = context.dictionary
    return dictionary.words if dictionary.respond_to?(:words)

    case dictionary
    when Array then dictionary
    when Hash then dictionary.keys
    else []
    end
  end

  # Calculate confidence from distance.
  #
  # Simple decay: confidence = 1 / (1 + distance), so a higher distance
  # gives a lower confidence. Subclasses may override for more
  # sophisticated calculations.
  #
  # @param distance [Integer] Edit distance
  # @return [Float] Confidence score (1.0 for distance 0)
  def calculate_confidence(distance)
    return 1.0 if distance.zero?

    1.0 / (1.0 + distance)
  end
end
294
+ end
295
+ end
296
+ end
@@ -0,0 +1,140 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Suggestions
5
+ module Strategies
6
# Composite strategy that chains multiple suggestion strategies.
# Implements the Composite Pattern for extensible suggestion generation:
# child strategies are ordinary objects that can be added, removed and
# reordered at runtime, and the composite itself is used like any other
# strategy.
#
# @example Using composite strategy
#   pipeline = CompositeStrategy.new(name: :pipeline)
#   pipeline.add(EditDistanceStrategy.new)
#   pipeline.add(PhoneticStrategy.new)
#   pipeline.add(NgramStrategy.new)
#   suggestions = pipeline.generate(context)
class CompositeStrategy < BaseStrategy
  # @return [Array<BaseStrategy>] Child strategies in execution order
  attr_reader :strategies

  # @param name [String, Symbol] Name of the composite
  # @param strategies [Array<BaseStrategy>] Initial strategies
  # @param config [Hash] Configuration options
  def initialize(name:, strategies: [], **config)
    # Work on a private copy: add/remove/clear/sort_by_priority! mutate the
    # list in place, which must not leak into (or fail on a frozen) array
    # owned by the caller.
    @strategies = Array(strategies).dup
    super(name: name, **config)
  end

  # Add a strategy to the pipeline.
  #
  # @param strategy [BaseStrategy] The strategy to add
  # @return [CompositeStrategy] Self for chaining
  def add(strategy)
    @strategies << strategy
    self
  end
  alias << add

  # Remove a strategy from the pipeline.
  #
  # @param strategy [BaseStrategy] The strategy to remove
  # @return [CompositeStrategy] Self for chaining
  def remove(strategy)
    @strategies.delete(strategy)
    self
  end

  # Clear all strategies.
  #
  # @return [CompositeStrategy] Self for chaining
  def clear
    @strategies.clear
    self
  end

  # Get strategies that can handle the given context.
  #
  # @param context [Context] The suggestion context
  # @return [Array<BaseStrategy>] Child strategies whose +handles?+ is true
  def applicable_strategies(context)
    @strategies.select { |strategy| strategy.handles?(context) }
  end

  # Generate suggestions by delegating to all applicable child strategies.
  #
  # Children run in list order and each result is merged into a single set
  # sized by +context.max_results+.
  #
  # @param context [Context] The suggestion context
  # @return [SuggestionSet] Combined suggestions from all strategies
  def generate(context)
    result = SuggestionSet.empty(max_size: context.max_results)

    applicable_strategies(context).each do |strategy|
      result.merge!(strategy.generate(context))
    end

    result
  end

  # Check if any strategy can handle the context.
  #
  # A disabled composite handles nothing, matching the contract of
  # BaseStrategy#handles?.
  #
  # @param context [Context] The suggestion context
  # @return [Boolean] True if enabled and any child handles the context
  def handles?(context)
    return false unless enabled?

    @strategies.any? { |strategy| strategy.handles?(context) }
  end

  # Get the number of strategies.
  #
  # @return [Integer] Number of strategies
  def size
    @strategies.size
  end
  alias count size

  # Check if the composite has any strategies.
  #
  # @return [Boolean] True if there are strategies
  def any?
    @strategies.any?
  end

  # Iterate over strategies.
  #
  # @yield [strategy] Each strategy
  # @return [Enumerator] Enumerator if no block given
  def each_strategy(&block)
    return enum_for(:each_strategy) unless block_given?

    @strategies.each(&block)
  end

  # Sort strategies in place by ascending priority value.
  #
  # @return [CompositeStrategy] Self for chaining
  def sort_by_priority!
    @strategies.sort_by!(&:priority)
    self
  end

  # Convert to string.
  #
  # @return [String] String representation
  def to_s
    "#{self.class.name}(name: #{@name}, strategies: #{@strategies.map(&:name).join(", ")})"
  end
  alias inspect to_s

  # Create a composite strategy named +:default+.
  #
  # No child strategies are installed here: unless +strategies:+ is passed
  # in +config+, the composite starts empty and the caller adds children
  # with {#add}.
  #
  # @param config [Hash] Configuration forwarded to {#initialize}
  # @return [CompositeStrategy] New composite named :default
  def self.with_defaults(**config)
    new(name: :default, **config)
  end
end
138
+ end
139
+ end
140
+ end