kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Components
5
# Base class for POS (Part-of-Speech) taggers.
#
# POS taggers assign grammatical categories (NOUN, VERB, ADJ, etc.) to tokens.
# Different languages use different POS tagging strategies:
# - Latin scripts: Dictionary-based (Hunspell flags → POS tags)
# - CJK: Integrated with morphological analysis (tokenizer provides POS)
# - German: Compound word decomposition affects tagging
#
# Common POS tags (Penn Treebank style):
# - CC: Coordinating conjunction
# - CD: Cardinal number
# - DT: Determiner
# - EX: Existential there
# - FW: Foreign word
# - IN: Preposition or subordinating conjunction
# - JJ: Adjective
# - JJR: Adjective, comparative
# - JJS: Adjective, superlative
# - LS: List item marker
# - MD: Modal
# - NN: Noun, singular or mass
# - NNS: Noun, plural
# - NNP: Proper noun, singular
# - NNPS: Proper noun, plural
# - PDT: Predeterminer
# - POS: Possessive ending
# - PRP: Personal pronoun
# - PRP$: Possessive pronoun
# - RB: Adverb
# - RBR: Adverb, comparative
# - RBS: Adverb, superlative
# - RP: Particle
# - SYM: Symbol
# - TO: to
# - UH: Interjection
# - VB: Verb, base form
# - VBD: Verb, past tense
# - VBG: Verb, gerund or present participle
# - VBN: Verb, past participle
# - VBP: Verb, non-3rd person singular present
# - VBZ: Verb, 3rd person singular present
# - WDT: Wh-determiner
# - WP: Wh-pronoun
# - WP$: Possessive wh-pronoun
# - WRB: Wh-adverb
#
# Language-specific tags:
# - CJK uses its own tagset (e.g., Japanese: 名詞, 動詞, etc.)
# - German uses STTS tagset
#
# The tagset actually emitted is up to the concrete subclass; the example
# below shows coarse tags (DET/NOUN/VERB) rather than the Penn Treebank
# tags listed above.
#
# @abstract Subclasses must implement #tag
#
# @example Tagging tokens
#   tagger = EnglishPosTagger.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
#   tokens = [
#     { token: "The", position: 0, length: 3 },
#     { token: "dog", position: 4, length: 3 },
#     { token: "runs", position: 8, length: 4 }
#   ]
#   tagged = tagger.tag(tokens)
#   # => [
#   #   { token: "The", position: 0, length: 3, pos_tag: "DET", lemma: "the" },
#   #   { token: "dog", position: 4, length: 3, pos_tag: "NOUN", lemma: "dog" },
#   #   { token: "runs", position: 8, length: 4, pos_tag: "VERB", lemma: "run" }
#   # ]
class PosTagger
  # Tag tokens with POS information.
  #
  # Takes an array of token hashes (from Tokenizer#tokenize) and adds:
  # - :pos_tag (String, nil) - POS category (NOUN, VERB, etc.) or nil if unknown
  # - :lemma (String, nil) - Lemma/base form or nil if unknown
  #
  # @abstract Subclasses must implement
  # @param tokens [Array<Hash>] Array of token hashes from Tokenizer
  # @return [Array<Hash>] Token hashes with added :pos_tag and :lemma keys
  # @raise [NotImplementedError] if not implemented by subclass
  def tag(tokens)
    raise NotImplementedError, "#{self.class} must implement #tag"
  end

  # Tag a single word.
  #
  # Convenience wrapper that builds a one-element token list (position 0,
  # length of the word) and delegates to #tag.
  #
  # @param word [String] The word to tag
  # @return [Hash] The first hash returned by #tag. When #tag returns an
  #   empty array, the untagged token is returned with :pos_tag and :lemma
  #   set to nil, so the result always has the same shape as a tagged token.
  def tag_word(word)
    token = { token: word, position: 0, length: word.length }

    # Fall back to the full token (not just the two tag keys) so callers can
    # read :token/:position/:length regardless of whether tagging succeeded.
    tag([token]).first || token.merge(pos_tag: nil, lemma: nil)
  end
end
97
+ end
98
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Components
5
# Base class for spell checkers.
#
# Spell checkers validate words and provide suggestions for misspelled words.
# Different languages use different spell checking strategies:
# - Latin scripts: Dictionary lookup (Hunspell, Morfologik)
# - CJK: Confusion rule checking (no dictionary)
# - RTL: Dictionary lookup with bidirectional text handling
#
# @abstract Subclasses must implement #check and #suggest
#
# @example Checking a word
#   checker = EnglishSpellChecker.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
#   result = checker.check("hello")
#   # => { found: true, stem: "hello", flags: [] }
#
# @example Getting suggestions
#   result = checker.check("helo")
#   # => { found: false, stem: nil, flags: [] }
#   suggestions = checker.suggest("helo")
#   # => [
#   #   { word: "hello", distance: 1, score: 0.9 },
#   #   { word: "help", distance: 2, score: 0.7 }
#   # ]
class SpellChecker
  # Check if a word is spelled correctly.
  #
  # Returns a hash with:
  # - :found (Boolean) - true if word is in dictionary
  # - :stem (String, nil) - The stem/lemma if found
  # - :flags (Array<String>) - Morphological flags
  #
  # @abstract Subclasses must implement
  # @param word [String] The word to check
  # @return [Hash] Result with :found, :stem, :flags
  # @raise [NotImplementedError] if not implemented by subclass
  def check(word)
    raise NotImplementedError, "#{self.class} must implement #check"
  end

  # Get spelling suggestions for a misspelled word.
  #
  # Returns an array of suggestion hashes with:
  # - :word (String) - The suggested word
  # - :distance (Integer) - Edit distance from original word
  # - :score (Float) - Confidence score (0-1, higher is better)
  #
  # Suggestions are sorted by relevance (highest score first).
  #
  # @abstract Subclasses must implement
  # @param word [String] The misspelled word
  # @param max_suggestions [Integer] Maximum number of suggestions to return
  # @return [Array<Hash>] Array of suggestion hashes
  # @raise [NotImplementedError] if not implemented by subclass
  def suggest(word, max_suggestions: 10)
    raise NotImplementedError, "#{self.class} must implement #suggest"
  end

  # Check if a word is spelled correctly.
  #
  # Convenience predicate built on #check. The :found entry is coerced to a
  # strict boolean, so a result hash without :found (or with a non-boolean
  # truthy value) still yields true/false rather than leaking nil or the
  # raw value.
  #
  # @param word [String] The word to check
  # @return [Boolean] true if word is correct
  def correct?(word)
    check(word)[:found] ? true : false
  end
end
72
+ end
73
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Components
5
# Base class for word form synthesizers.
#
# Synthesizers generate inflected forms from a lemma (base form).
# This is the inverse of lemmatization:
# - Lemmatization: "runs" → "run"
# - Synthesis: "run" → ["run", "runs", "running", "ran"]
#
# Different languages use different synthesis strategies:
# - Latin scripts: Hunspell affix rules
# - CJK: Not applicable (no inflection)
# - German: Compound word + affix synthesis
# - Finnish: Complex agglutinative patterns
#
# @abstract Subclasses must implement #synthesize
#
# @example Synthesizing English verb forms
#   synthesizer = EnglishSynthesizer.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
#   forms = synthesizer.synthesize("run", "VERB")
#   # => ["run", "runs", "running", "ran"]
#
# @example Synthesizing with POS constraint
#   forms = synthesizer.synthesize("happy", "ADJ")
#   # => ["happy", "happier", "happiest"]
class Synthesizer
  # POS tags that #synthesize_all iterates over when none are given.
  DEFAULT_POS_TAGS = %w[NOUN VERB ADJ ADV].freeze

  # Generate inflected forms of a word.
  #
  # Given a lemma (base form) and a POS tag, returns all possible
  # inflected forms of that word.
  #
  # @abstract Subclasses must implement
  # @param lemma [String] The base form (lemma)
  # @param pos_tag [String] The POS tag to constrain generation
  # @return [Array<String>] Array of inflected forms
  # @raise [NotImplementedError] if not implemented by subclass
  def synthesize(lemma, pos_tag)
    raise NotImplementedError, "#{self.class} must implement #synthesize"
  end

  # Generate inflected forms for several POS tags at once.
  #
  # Default implementation that calls #synthesize once per tag, in the
  # order given; subclasses can override it with something cheaper.
  #
  # @param lemma [String] The base form (lemma)
  # @param pos_tags [Array<String>] POS tags to generate forms for
  #   (defaults to NOUN, VERB, ADJ and ADV)
  # @return [Hash{String => Array<String>}] Hash mapping each POS tag to
  #   the forms returned by #synthesize for it
  def synthesize_all(lemma, pos_tags: DEFAULT_POS_TAGS)
    pos_tags.each_with_object({}) do |pos_tag, forms_by_tag|
      forms_by_tag[pos_tag] = synthesize(lemma, pos_tag)
    end
  end
end
59
+ end
60
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Components
5
# Abstract parent of every tokenizer.
#
# A tokenizer breaks a text into tokens (words, punctuation). The
# strategy depends on the language:
# - Latin scripts: Whitespace + punctuation
# - CJK: Morphological analysis
# - German: Compound word splitting
# - RTL: Right-to-left text handling
#
# @abstract Subclasses must implement #tokenize
#
# @example Tokenizing English text
#   tokenizer = WhitespaceTokenizer.new
#   tokens = tokenizer.tokenize("Hello, world!")
#   # => [
#   #   { token: "Hello", position: 0, length: 5 },
#   #   { token: ",", position: 5, length: 1 },
#   #   { token: "world", position: 7, length: 5 },
#   #   { token: "!", position: 12, length: 1 }
#   # ]
class Tokenizer
  # Break +text+ into token hashes.
  #
  # Every token hash carries:
  # - :token (String) - The token text
  # - :position (Integer) - Character position in original text
  # - :length (Integer) - Token length in characters
  #
  # Subclasses may attach further keys, for example:
  # - :pos_tag (String) - Part of speech tag
  # - :lemma (String) - Base form / lemma
  # - :compound_part (Boolean) - Whether this is a compound word part
  # - :script (Symbol) - Script type for multilingual text
  #
  # @abstract Subclasses must implement
  # @param text [String] The input text
  # @return [Array<Hash>] Array of token hashes
  # @raise [NotImplementedError] if not implemented by subclass
  def tokenize(text)
    raise(NotImplementedError, "#{self.class} must implement #tokenize")
  end

  # Tokenize +text+ and keep only the :token value of each token hash.
  #
  # Handy when positions and lengths are not needed.
  #
  # @param text [String] The input text
  # @return [Array<String>] Array of token strings
  def tokenize_to_strings(text)
    token_hashes = tokenize(text)
    token_hashes.map { |entry| entry[:token] }
  end
end
57
+ end
58
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'tokenizer'
4
+
5
+ module Kotoshu
6
+ module Components
7
# Whitespace-based tokenizer for Latin-script languages.
#
# Splits text on whitespace and separates punctuation.
# Suitable for languages with space-separated words (English, French, German, etc.).
#
# Word characters are matched with the Unicode-aware POSIX class
# [[:word:]] rather than \w: in Ruby, \w and \s only cover ASCII, which
# would split accented words such as "café" or "über" in the middle.
#
# This is a simple tokenizer that works well for most Latin-script languages.
# For more advanced tokenization (contractions, compounds), use language-specific
# tokenizers.
#
# @example Basic tokenization
#   tokenizer = WhitespaceTokenizer.new
#   tokens = tokenizer.tokenize("Hello, world!")
#   # => [
#   #   { token: "Hello", position: 0, length: 5 },
#   #   { token: ",", position: 5, length: 1 },
#   #   { token: "world", position: 7, length: 5 },
#   #   { token: "!", position: 12, length: 1 }
#   # ]
#
# @example Tokenizing to strings
#   tokenizer.tokenize_to_strings("Hello, world!")
#   # => ["Hello", ",", "world", "!"]
class WhitespaceTokenizer < Tokenizer
  # Regex pattern for matching tokens: a run of word characters and
  # apostrophes, or a single character that is neither word nor whitespace.
  TOKEN_PATTERN = /[[:word:]']+|[^[:word:][:space:]]/.freeze

  # @return [Regexp] The token pattern used by this tokenizer
  attr_reader :pattern

  # Create a new whitespace tokenizer.
  #
  # @param pattern [Regexp] Optional custom token pattern
  def initialize(pattern: TOKEN_PATTERN)
    @pattern = pattern
  end

  # Split text into tokens.
  #
  # Each token is a hash with:
  # - :token (String) - The full text matched by the pattern
  # - :position (Integer) - Character position in original text
  # - :length (Integer) - Token length in characters
  #
  # @param text [String, nil] The input text
  # @return [Array<Hash>] Array of token hashes; empty for nil or "" input
  def tokenize(text)
    return [] if text.nil? || text.empty?

    tokens = []

    # String#scan sets the last-match data for every iteration. Reading the
    # whole match and its offset from there (instead of the block argument)
    # stays correct when a custom pattern contains capture groups, and avoids
    # re-searching the text for each token.
    text.scan(@pattern) do
      match = Regexp.last_match
      token_text = match[0]

      tokens << {
        token: token_text,
        position: match.begin(0),
        length: token_text.length
      }
    end

    tokens
  end

  # Check if a character is a word character.
  #
  # @param char [String] Single character
  # @return [Boolean] True if word character
  def word_char?(char)
    char.match?(/[[:word:]]/)
  end

  # Check if a character is punctuation (neither word nor whitespace).
  #
  # @param char [String] Single character
  # @return [Boolean] True if punctuation
  def punctuation?(char)
    char.match?(/[^[:word:][:space:]]/)
  end
end
95
+ end
96
+ end
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../configuration"
4
+
5
+ module Kotoshu
6
+ class Configuration
7
# Builder for creating immutable Configuration objects.
#
# Collects settings in an internal hash seeded from DEFAULTS and hands a
# copy of it to Configuration.new; the resulting object is frozen before
# it is returned.
#
# @example Building with block
#   config = Configuration::Builder.build do |b|
#     b.dictionary_path = "words.txt"
#     b.language = "en-GB"
#   end
#
# @example Building with fluent methods
#   config = Configuration::Builder.new
#     .with_dictionary_path("words.txt")
#     .with_language("en-GB")
#     .to_config
class Builder
  # Build an immutable configuration.
  #
  # @yield [builder] Optional block for configuration
  # @yieldparam builder [Builder] Fresh builder to configure
  # @return [Configuration] Frozen configuration object
  #
  # @example With block
  #   config = Builder.build do |b|
  #     b.dictionary_path = "words.txt"
  #   end
  #
  # @example Without block (uses defaults)
  #   config = Builder.build
  def self.build
    builder = new
    yield(builder) if block_given?
    builder.to_config
  end

  # Create a new builder whose settings start as a copy of DEFAULTS.
  def initialize
    @settings = DEFAULTS.dup
  end

  # NOTE on the attribute writers below: with assignment syntax
  # (`builder.language = "en"`) Ruby evaluates the expression to the assigned
  # value, so they cannot be chained; use the with_* methods for chaining.
  # They still return self when invoked through send/public_send.

  # Set dictionary path.
  #
  # @param path [String] Path to dictionary file
  # @return [self]
  def dictionary_path=(path)
    with_dictionary_path(path)
  end

  # Set dictionary type.
  #
  # @param type [Symbol] Dictionary type
  # @return [self]
  def dictionary_type=(type)
    with_dictionary_type(type)
  end

  # Set language code.
  #
  # @param lang [String] Language code
  # @return [self]
  def language=(lang)
    with_language(lang)
  end

  # Set locale.
  #
  # @param locale [String, nil] Locale
  # @return [self]
  def locale=(locale)
    with_locale(locale)
  end

  # Set max suggestions.
  #
  # @param max [Integer] Maximum suggestions
  # @return [self]
  def max_suggestions=(max)
    with_max_suggestions(max)
  end

  # Set case sensitivity.
  #
  # @param sensitive [Boolean] Case sensitive flag
  # @return [self]
  def case_sensitive=(sensitive)
    with_case_sensitive(sensitive)
  end

  # Set verbose mode.
  #
  # @param verbose [Boolean] Verbose flag
  # @return [self]
  def verbose=(verbose)
    with_verbose(verbose)
  end

  # Set suggestion algorithms.
  #
  # @param algorithms [Array<Class>, nil] Suggestion algorithms
  # @return [self]
  def suggestion_algorithms=(algorithms)
    with_suggestion_algorithms(algorithms)
  end

  # Set custom words. A frozen copy is stored, so later changes to the
  # caller's array do not affect the builder.
  #
  # @param words [Array<String>] Custom words
  # @return [self]
  def custom_words=(words)
    with_custom_words(words)
  end

  # Set encoding.
  #
  # @param encoding [String] Character encoding
  # @return [self]
  def encoding=(encoding)
    with_encoding(encoding)
  end

  # Fluent method to set dictionary path.
  #
  # @param path [String] Path to dictionary file
  # @return [Builder] This builder, for chaining
  def with_dictionary_path(path)
    store(:dictionary_path, path)
  end

  # Fluent method to set dictionary type.
  #
  # @param type [Symbol] Dictionary type
  # @return [Builder] This builder, for chaining
  def with_dictionary_type(type)
    store(:dictionary_type, type)
  end

  # Fluent method to set language.
  #
  # @param lang [String] Language code
  # @return [Builder] This builder, for chaining
  def with_language(lang)
    store(:language, lang)
  end

  # Fluent method to set locale.
  #
  # @param locale [String, nil] Locale
  # @return [Builder] This builder, for chaining
  def with_locale(locale)
    store(:locale, locale)
  end

  # Fluent method to set max suggestions.
  #
  # @param max [Integer] Maximum suggestions
  # @return [Builder] This builder, for chaining
  def with_max_suggestions(max)
    store(:max_suggestions, max)
  end

  # Fluent method to set case sensitivity.
  #
  # @param sensitive [Boolean] Case sensitive flag
  # @return [Builder] This builder, for chaining
  def with_case_sensitive(sensitive)
    store(:case_sensitive, sensitive)
  end

  # Fluent method to set verbose mode.
  #
  # @param verbose [Boolean] Verbose flag
  # @return [Builder] This builder, for chaining
  def with_verbose(verbose)
    store(:verbose, verbose)
  end

  # Fluent method to set suggestion algorithms.
  #
  # @param algorithms [Array<Class>, nil] Suggestion algorithms
  # @return [Builder] This builder, for chaining
  def with_suggestion_algorithms(algorithms)
    store(:suggestion_algorithms, algorithms)
  end

  # Fluent method to set custom words (stored as a frozen copy).
  #
  # @param words [Array<String>] Custom words
  # @return [Builder] This builder, for chaining
  def with_custom_words(words)
    store(:custom_words, words.dup.freeze)
  end

  # Fluent method to set encoding.
  #
  # @param encoding [String] Character encoding
  # @return [Builder] This builder, for chaining
  def with_encoding(encoding)
    store(:encoding, encoding)
  end

  # Convert builder to frozen Configuration.
  #
  # The configuration receives a copy of the settings hash, so the builder
  # can keep being modified (and reused) without touching configurations
  # that were already built.
  #
  # @return [Configuration] Frozen configuration object
  def to_config
    Configuration.new(@settings.dup).freeze
  end

  private

  # Record one setting and return the builder so calls can be chained.
  def store(key, value)
    @settings[key] = value
    self
  end
end
208
+ end
209
+ end