kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,477 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../readers/lookup_builder'
4
+ require_relative '../../components/spell_checker'
5
+ require_relative '../../components/pos_tagger'
6
+ require_relative '../../language/normalizer/base'
7
+
8
+ module Kotoshu
9
+ module Languages
10
+ # Japanese language implementation.
11
+ #
12
+ # Supports ja-JP with full CJK script support.
13
+ #
14
+ # Uses morphological analysis via Suika gem for tokenization and POS tagging.
15
+ # Japanese spell checking uses dictionary lookup with CJK character support.
16
+ class Japanese < Language::Base
17
+ # Japanese spell checker using dictionary lookup.
18
+ #
19
+ # Japanese uses morphological analysis rather than traditional Hunspell
20
+ # dictionaries. Spell checking is done through dictionary lookup of segmented
21
+ # words from the morphological analyzer.
22
+ class SpellChecker < Components::SpellChecker
23
+ attr_reader :dic_path, :script
24
+
25
# Builds a checker backed by the word list stored at +dic_path+.
#
# @param dic_path [String] location of the word list file
# @param script [Symbol] script family tag kept for callers (default :cjk)
def initialize(dic_path:, script: :cjk)
  @script = script
  @dic_path = dic_path
  # The word list is read once, up front, and kept in memory; #check and
  # #suggest only ever consult this in-memory collection.
  @dictionary = load_dictionary(@dic_path)
end
32
+
33
# Looks +word+ up in the in-memory dictionary.
#
# @param word [String, nil] the segmented word to verify
# @return [Hash] +{ found:, stem:, flags: }+; +stem+ is the word itself on a
#   hit and nil otherwise, +flags+ is always an empty array
def check(word)
  miss = { found: false, stem: nil, flags: [] }
  return miss if word.nil? || word.empty?

  # Only exact dictionary membership counts; no character-level validation
  # of CJK text is attempted for unknown words.
  @dictionary.include?(word) ? { found: true, stem: word, flags: [] } : miss
end
47
+
48
# Proposes replacements for a word that is not in the dictionary.
#
# @param word [String, nil] the word to find alternatives for
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<String>] empty when +word+ is nil, empty, or already known
def suggest(word, max_suggestions: 10)
  nothing_to_suggest = word.nil? || word.empty? || @dictionary.include?(word)
  return [] if nothing_to_suggest

  candidates = generate_suggestions(word, max_suggestions)
  candidates.take(max_suggestions)
end
55
+
56
# Convenience predicate over #check.
#
# @param word [String, nil]
# @return [Boolean] the +:found+ entry of the #check result
def correct?(word)
  result = check(word)
  result[:found]
end
59
+
60
+ private
61
+
62
# Reads a newline-separated word list into an in-memory Set.
#
# Each line is stripped of surrounding whitespace. Blank lines are skipped so
# the empty string never becomes a dictionary entry (it would otherwise be
# offered as a suggestion for short words), and a leading UTF-8 byte order
# mark is removed so the first word of the file stays matchable.
#
# @param path [String, nil] location of the word list
# @return [Set<String>] the loaded words; empty when +path+ is nil or does
#   not name a regular file
def load_dictionary(path)
  # Set is only autoloaded on newer Rubies; load it explicitly elsewhere.
  require 'set' unless defined?(::Set)

  @dictionary = Set.new
  return @dictionary if path.nil? || !File.file?(path)

  # Stream the file line by line instead of materialising every line first.
  File.foreach(path, encoding: 'UTF-8') do |line|
    entry = line.delete_prefix("\uFEFF").strip
    @dictionary.add(entry) unless entry.empty?
  end
  @dictionary
end
73
+
74
# Per-character rewrites tried before the dictionary scan. Each key is a
# single kana; each value lists the strings that may replace it.
#
# NOTE(review): apart from 'を' => 'お', every entry replaces a kana with a
# two-character string (e.g. 'か' => 'かが'). Confirm whether single voiced or
# doubled variants were intended; the data is kept exactly as shipped.
JAPANESE_SUBSTITUTIONS = {
  'あ' => %w[ああ],
  'い' => %w[いい],
  'う' => %w[うう],
  'え' => %w[ええ],
  'お' => %w[おお],
  'か' => %w[かが],
  'き' => %w[きぎ],
  'く' => %w[くぐ],
  'け' => %w[けげ],
  'こ' => %w[こご],
  'さ' => %w[さざ],
  'し' => %w[しじ],
  'す' => %w[すず],
  'せ' => %w[せぜ],
  'そ' => %w[そぞ],
  'た' => %w[ただ],
  'ち' => %w[ちぢ],
  'つ' => %w[つづ],
  'て' => %w[てで],
  'と' => %w[とど],
  'は' => %w[はば],
  'ひ' => %w[ひび],
  'ふ' => %w[ふぶ],
  'へ' => %w[へべ],
  'ほ' => %w[ほぼ],
  'ま' => %w[まま],
  'み' => %w[みみ],
  'む' => %w[むむ],
  'め' => %w[めめ],
  'も' => %w[もも],
  'や' => %w[やや],
  'ゆ' => %w[ゆゆ],
  'よ' => %w[よよ],
  'ら' => %w[らら],
  'り' => %w[りり],
  'る' => %w[るる],
  'れ' => %w[れれ],
  'ろ' => %w[ろろ],
  'わ' => %w[わわ],
  'を' => %w[お]
}.freeze

# Collects dictionary words that could replace +word+.
#
# Two passes feed the result, in this order:
# 1. per-character rewrites from JAPANESE_SUBSTITUTIONS that produce a
#    dictionary word;
# 2. for words of two or more characters, dictionary entries within edit
#    distance 1..2, scanned in dictionary order until twice
#    +max_suggestions+ candidates have been gathered.
#
# @param word [String] the unknown word
# @param max_suggestions [Integer] maximum number of results
# @return [Array<String>] unique candidates, at most +max_suggestions+
def generate_suggestions(word, max_suggestions)
  variations = []

  word.each_char.with_index do |char, i|
    JAPANESE_SUBSTITUTIONS.fetch(char, []).each do |sub|
      substituted = word.dup
      substituted[i] = sub
      variations << substituted if @dictionary.include?(substituted)
    end
  end

  if word.length >= 2
    @dictionary.each do |dict_word|
      variations << dict_word if near_miss?(word, dict_word)
      break if variations.length >= max_suggestions * 2
    end
  end

  variations.uniq.first(max_suggestions)
end

# True when +candidate+ is a non-empty word at edit distance 1 or 2 from +word+.
def near_miss?(word, candidate)
  # An empty entry is never a useful suggestion, even though its distance
  # to a two-character word is 2.
  return false if candidate.empty?
  # The edit distance is at least the length difference, so words that
  # differ by more than two characters are rejected without the full
  # computation.
  return false if (candidate.length - word.length).abs > 2

  levenshtein_distance(word, candidate).between?(1, 2)
end

# Levenshtein (insert / delete / substitute) distance between two strings,
# counted in characters.
#
# @param a [String]
# @param b [String]
# @return [Integer] minimum number of single-character edits turning a into b
def levenshtein_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  # Work on character arrays: indexing into a multibyte String is not
  # constant-time, and only the previous row of the matrix is ever needed.
  source = a.chars
  target = b.chars
  previous = (0..target.length).to_a

  source.each_with_index do |src_char, i|
    current = [i + 1]
    target.each_with_index do |tgt_char, j|
      cost = src_char == tgt_char ? 0 : 1
      current << [previous[j + 1] + 1, current[j] + 1, previous[j] + cost].min
    end
    previous = current
  end

  previous.last
end
157
+ end
158
+
159
+ # Japanese tokenizer with morphological analysis.
160
# Adds no behaviour of its own: it exposes
# Language::Tokenizer::JapaneseTokenizer under this language's namespace.
class Tokenizer < Language::Tokenizer::JapaneseTokenizer; end
162
+
163
+ # Japanese POS tagger using morphological analysis.
164
+ #
165
+ # Japanese POS tagging is integrated with tokenization via Suika gem,
166
+ # which provides both segmentation and part-of-speech information.
167
+ #
168
+ # Suika output format: surface<TAB>POS,subcat1,subcat2,subcat3,conj_type,conj_form,lemma,reading,pronunciation
169
+ # Example: "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
170
+ #
171
+ # POS tags use universal English categories for common types, and ROMAJI
172
+ # (Latin script) identifiers based on Japanese terminology only for
173
+ # language-specific categories without universal equivalents.
174
+ class POSTagger < Components::PosTagger
175
+ # Japanese POS tag mappings from Suika to standard identifiers.
176
+ #
177
+ # Strategy: Use universal English POS tags (NOUN, VERB, etc.) with
178
+ # English suffixes for subcategories. All identifiers are ASCII.
179
+ #
180
+ # Main categories (field 0) - universal:
181
+ # - 名詞 → NOUN
182
+ # - 動詞 → VERB
183
+ # - 助詞 → PARTICLE
184
+ # - 助動詞 → AUX
185
+ #
186
+ # Noun subcategories (field 1):
187
+ # - NOUN_COMMON: 一般 - common nouns
188
+ # - NOUN_PROPER: 固有名詞 - proper nouns
189
+ # - NOUN_PROPER_GEOGRAPHIC: 固有名詞,地域 - proper noun, geographic
190
+ # - NOUN_SUFFIX: 接尾 - suffixes
191
+ # - NOUN_DEPENDENT: 非自立 - dependent nouns (cannot stand alone)
192
+ # - NOUN_SA_CONNECTION: サ変接続 - sa-variant connection nouns
193
+ #
194
+ # Particle subcategories (field 1):
195
+ # - PARTICLE_GRAMMAR: 格助詞 - grammar/case particles (が, を, に, etc.)
196
+ # - PARTICLE_BINDING: 係助詞 - binding particles (は, も, etc.)
197
+ # - PARTICLE_ADNOMINAL: 連体化 - adnominal particles (の)
198
+ #
199
+ # Verb subcategories (field 1):
200
+ # - VERB_INDEPENDENT: 自立 - independent verbs
201
# Keys are comma-joined prefixes of the analyzer's feature list (main
# category first); values are the ASCII tag reported for that prefix.
FLAG_TO_POS = {
  # Main categories
  '名詞'               => 'NOUN',
  '動詞'               => 'VERB',
  '助詞'               => 'PARTICLE',
  '助動詞'             => 'AUX',

  # Nouns, by first (and second) subcategory
  '名詞,一般'          => 'NOUN_COMMON',
  '名詞,固有名詞'      => 'NOUN_PROPER',
  '名詞,固有名詞,地域' => 'NOUN_PROPER_GEOGRAPHIC',
  '名詞,接尾'          => 'NOUN_SUFFIX',
  '名詞,非自立'        => 'NOUN_DEPENDENT',
  '名詞,サ変接続'      => 'NOUN_SA_CONNECTION',

  # Particles, by first subcategory
  '助詞,格助詞'        => 'PARTICLE_GRAMMAR',
  '助詞,係助詞'        => 'PARTICLE_BINDING',
  '助詞,連体化'        => 'PARTICLE_ADNOMINAL',

  # Verbs, by first subcategory
  '動詞,自立'          => 'VERB_INDEPENDENT'
}.freeze
224
+
225
# @param dictionary_path [String, nil] stored on the instance as given
# @param flag_mapping [Hash] feature-path => tag table (FLAG_TO_POS by default)
def initialize(dictionary_path: nil, flag_mapping: FLAG_TO_POS)
  @dictionary_path, @flag_mapping = dictionary_path, flag_mapping
  @lookup_cache = {}   # word => { pos_tag:, lemma: }
  @suika_tagger = nil  # created on first use by #tag
end
231
+
232
# Annotates each token hash with +:pos_tag+ and +:lemma+.
#
# The Suika gem is required on first use and a single tagger instance is
# reused afterwards. Tokens whose +:token+ is nil or empty are returned with
# both annotations set to nil; for every other token the lemma falls back to
# the token text when the lookup yields none.
#
# @param tokens [Array<Hash>, nil] hashes carrying the surface under +:token+
# @return [Array<Hash>] new hashes (the inputs are merged, not mutated)
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  require "suika" unless defined?(::Suika)
  @suika_tagger ||= ::Suika::Tagger.new

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
249
+
250
# @return [Hash] the feature-path => tag table held by this tagger
def flag_mapping
  @flag_mapping
end

# Replaces the table held by this tagger.
#
# @param new_mapping [Hash]
def flag_mapping=(new_mapping)
  @flag_mapping = new_mapping
end

# Forgets every memoised per-word lookup result.
#
# @return [Hash] the (now empty) cache
def clear_cache
  @lookup_cache.clear
end
261
+
262
+ private
263
+
264
# Resolves the POS tag and lemma of one word, memoising the answer per word.
#
# @param word [String, nil]
# @return [Hash] +{ pos_tag:, lemma: }+; both nil for a nil or empty word
#   (that case is not cached)
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    # Cache miss: run the analyzer once and derive both values from the
    # same parse result.
    parsed = @suika_tagger.parse(word)
    @lookup_cache[word] = {
      pos_tag: extract_pos_from_suika(parsed),
      lemma: extract_lemma_from_suika(parsed)
    }
  end
end
280
+
281
# Maps the first entry of an analyzer result to a POS tag.
#
# Each entry is expected to look like
#   surface<TAB>POS,sub1,sub2,sub3,conj_type,conj_form,lemma,reading,pronunciation
# After splitting the part behind the tab on commas:
#   index 0     main POS category (e.g. 名詞, 動詞, 助詞)
#   index 1..5  subcategories and conjugation info
#   index 6     lemma (dictionary form)
#   index 7, 8  reading and pronunciation (katakana)
#
# Features are stripped of surrounding whitespace so that both "a,b" and
# "a, b" separated output yield the same lookup keys.
# NOTE(review): Suika appears to join features with ", " — confirm against
# the gem; without the strip only the main category could ever match.
#
# The table consulted is the instance's configured mapping (FLAG_TO_POS when
# none is set). Results are memoised per word by #lookup_with_pos, so call
# #clear_cache after swapping the mapping.
#
# @param parsed [Array<String>, nil] analyzer output
# @return [String, nil] tag of the most specific matching feature prefix
def extract_pos_from_suika(parsed)
  return nil unless parsed && parsed.first

  parts = parsed.first.split("\t")
  return nil unless parts.length > 1

  features = parts[1].split(',').map(&:strip)
  mapping = @flag_mapping || FLAG_TO_POS

  # Try the longest prefix first (at most six features take part), e.g.
  # "名詞,固有名詞,地域", then "名詞,固有名詞", then "名詞".
  [features.length, 6].min.downto(1) do |depth|
    pos_path = features.first(depth).join(',')
    return mapping[pos_path] if mapping.key?(pos_path)
  end

  nil
end
317
+
318
# Pulls the dictionary form out of the first entry of an analyzer result.
#
# Entries are expected to look like
#   surface<TAB>POS,sub1,sub2,sub3,conj_type,conj_form,lemma,reading,pronunciation
# so the lemma is the seventh comma-separated feature. When that feature is
# missing, empty, or the "*" placeholder, the surface form is returned
# instead.
#
# @param parsed [Array<String>, nil] analyzer output
# @return [String, nil] lemma or surface form; nil when there is no entry or
#   the entry has no tab-separated feature part
def extract_lemma_from_suika(parsed)
  return nil unless parsed && parsed.first

  parts = parsed.first.split("\t")
  return nil unless parts.length > 1

  base_form = parts[1].split(',')[6]&.strip
  return parts[0] if base_form.nil? || base_form.empty? || base_form == '*'

  base_form
end
328
+ end
329
+
330
+ # Japanese grammar rules module.
331
module GrammarRules
  # Base class for a grammar rule: carries identifying metadata and declares
  # the #check contract that concrete rules implement.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] stable rule identifier
    # @param name [String] short human-readable name
    # @param description [String] explanation of what the rule covers
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param _tokens [Array<Hash>, nil] tokens to inspect
    # @return [Array<Hash>] findings
    # @raise [NotImplementedError] always, in this base class
    def check(_tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end
  end

  # Rule: Particle usage (wa vs ga)
  class ParticleRule < Rule
    def initialize
      super('JA_PARTICLE_USAGE', 'Particle Usage', 'Correct usage of topic marker は vs subject marker が.')
    end

    # Placeholder implementation: never reports anything.
    #
    # @return [Array] always empty
    def check(_tokens)
      []
    end
  end

  # Rule: Script mixing
  class ScriptMixingRule < Rule
    HIRAGANA = /[\u3040-\u309F]/.freeze
    # U+30A0 (double hyphen), U+30FB (middle dot) and U+30FC (prolonged
    # sound mark) sit in the Katakana block but are shared punctuation and
    # length marks that also appear in hiragana words (e.g. 凄ーい), so they
    # are not treated as evidence of katakana.
    KATAKANA = /[\u30A1-\u30FA\u30FD-\u30FF]/.freeze
    KANJI = /[\u4E00-\u9FFF]/.freeze

    def initialize
      super('JA_SCRIPT_MIXING', 'Script Mixing', 'Japanese text uses multiple scripts (Hiragana, Katakana, Kanji).')
    end

    # Reports every token whose text contains hiragana, katakana and kanji
    # all at once.
    #
    # @param tokens [Array<Hash>, nil] hashes with +:token+ and +:position+
    # @return [Array<Hash>] one finding per offending token; empty for nil
    def check(tokens)
      return [] if tokens.nil?

      tokens.each_with_object([]) do |token, errors|
        word = token[:token]
        next if word.nil? || word.empty?
        next unless mixes_all_scripts?(word)

        errors << {
          rule_id: @id,
          position: token[:position],
          message: "Unusual script mixing in word '#{word}'",
          suggestion: 'Review script usage',
          context: word,
          suggestions: ['Use consistent script']
        }
      end
    end

    private

    # True when +word+ contains at least one character of each script.
    def mixes_all_scripts?(word)
      word.match?(HIRAGANA) && word.match?(KATAKANA) && word.match?(KANJI)
    end
  end

  # Lookup point for the rules shipped with this language.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] freshly built instances of every default rule
      def default_rules
        [ParticleRule.new, ScriptMixingRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] the default rule with that id, if any
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
403
+
404
+ # Registration
405
+ register "ja"
406
+ register "ja-JP"
407
+
408
# No Hunspell dictionaries are listed for Japanese: its dictionaries come in
# custom formats and Suika uses its own. The nested hash is frozen as well so
# the constant cannot be altered through one of its values.
HUNSPELL_DICTIONARIES = {
  'ja-JP' => {}.freeze
}.freeze

# Display names for region codes, used when describing a variant.
VARIANT_NAMES = {
  'JP' => 'Japan'
}.freeze
418
+
419
# @param code [String] language code, optionally with a region ("ja", "ja-JP")
# @param name [String] display name
# @param variant [String, nil] region code; derived from +code+ when omitted
def initialize(code: "ja", name: "Japanese", variant: nil)
  super(code: code, name: name, variant: variant || extract_region_code(code))
end
423
+
424
# Human-readable label: the plain name, or "name (region)" when a variant is
# set. Region codes without an entry in VARIANT_NAMES are shown verbatim.
#
# @return [String]
def description
  region = variant
  return name unless region

  "#{name} (#{VARIANT_NAMES[region] || region})"
end
429
+
430
# @return [Tokenizer] a single instance, built on first access and reused
def tokenizer
  return @tokenizer if @tokenizer

  @tokenizer = Tokenizer.new
end
433
+
434
# @return [Language::Normalizer::Base] a single instance, built on first
#   access and reused
def normalizer
  return @normalizer if @normalizer

  @normalizer = Language::Normalizer::Base.new
end
437
+
438
# @return [Class] the dictionary backend class for this language
#   (Dictionary::UnixWords)
def dictionary_class
  Dictionary::UnixWords
end
441
+
442
# Word list locations tried by default.
#
# NOTE(review): this is the conventional location of the system word list —
# confirm it is the intended source of Japanese words.
#
# @return [Array<String>]
def default_dictionary_paths
  ["/usr/share/dict/words"]
end
445
+
446
# @return [Symbol] script family of this language, always +:cjk+
def script_type
  :cjk
end
449
+
450
# Builds the dictionary-lookup spell checker (not a Hunspell one) over the
# first default dictionary path.
#
# @return [SpellChecker]
def create_spell_checker
  word_list = default_dictionary_paths.first
  SpellChecker.new(dic_path: word_list, script: :cjk)
end
457
+
458
# @return [Tokenizer] a new instance on every call (unlike #tokenizer,
#   nothing is memoised here)
def create_tokenizer
  Tokenizer.new
end
461
+
462
# Builds a POS tagger configured with the first default dictionary path and
# the stock POSTagger::FLAG_TO_POS table.
#
# @return [POSTagger]
def create_pos_tagger
  word_list = default_dictionary_paths.first
  POSTagger.new(dictionary_path: word_list, flag_mapping: POSTagger::FLAG_TO_POS)
end
468
+
469
+ private
470
+
471
# Derives the region part of a language code.
#
# Everything after the first hyphen is upcased and returned, so "ja-JP" and
# "ja-jp" both give "JP". A code without a hyphen, a code ending in a bare
# hyphen ("ja-"), and nil all give nil — an empty string is never returned,
# because callers treat any non-nil variant as a real region.
#
# @param code [String, nil] language code such as "ja" or "ja-JP"
# @return [String, nil] upcased region code, or nil when there is none
def extract_region_code(code)
  return nil if code.nil?

  _language, region = code.to_s.split("-", 2)
  return nil if region.nil? || region.empty?

  region.upcase
end
475
+ end
476
+ end
477
+ end