kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../../readers/lookup_builder'
|
|
4
|
+
require_relative '../../components/spell_checker'
|
|
5
|
+
require_relative '../../components/pos_tagger'
|
|
6
|
+
require_relative '../../language/normalizer/base'
|
|
7
|
+
|
|
8
|
+
module Kotoshu
|
|
9
|
+
module Languages
|
|
10
|
+
# Japanese language implementation.
|
|
11
|
+
#
|
|
12
|
+
# Supports ja-JP with full CJK script support.
|
|
13
|
+
#
|
|
14
|
+
# Uses morphological analysis via Suika gem for tokenization and POS tagging.
|
|
15
|
+
# Japanese spell checking uses dictionary lookup with CJK character support.
|
|
16
|
+
class Japanese < Language::Base
|
|
17
|
+
# Japanese spell checker using dictionary lookup.
|
|
18
|
+
#
|
|
19
|
+
# Japanese uses morphological analysis rather than traditional Hunspell
|
|
20
|
+
# dictionaries. Spell checking is done through dictionary lookup of segmented
|
|
21
|
+
# words from the morphological analyzer.
|
|
22
|
+
class SpellChecker < Components::SpellChecker
|
|
23
|
+
attr_reader :dic_path, :script
|
|
24
|
+
|
|
25
|
+
# Remembers the dictionary location and script, then eagerly loads the word
# list so that later lookups are served from memory.
#
# @param dic_path [String] path to a newline-delimited word list
# @param script [Symbol] script family handled by this checker
def initialize(dic_path:, script: :cjk)
  @dic_path, @script = dic_path, script
  @dictionary = load_dictionary(dic_path)
end
|
|
32
|
+
|
|
33
|
+
# Looks a word up in the in-memory dictionary.
#
# @param word [String, nil] segmented word to verify
# @return [Hash] :found (Boolean), :stem (the word itself on a hit, else
#   nil) and :flags (always an empty array)
def check(word)
  miss = { found: false, stem: nil, flags: [] }
  return miss if word.nil? || word.empty?

  # Only exact dictionary membership counts; no character-level validation
  # is attempted for unknown CJK text.
  @dictionary.include?(word) ? { found: true, stem: word, flags: [] } : miss
end
|
|
47
|
+
|
|
48
|
+
# Proposes replacements for a word that is not in the dictionary.
#
# @param word [String, nil] the word to find alternatives for
# @param max_suggestions [Integer] upper bound on the number of results
# @return [Array<String>] empty when the word is blank or already known
def suggest(word, max_suggestions: 10)
  nothing_to_fix = word.nil? || word.empty? || @dictionary.include?(word)
  return [] if nothing_to_fix

  candidates = generate_suggestions(word, max_suggestions)
  candidates.first(max_suggestions)
end
|
|
55
|
+
|
|
56
|
+
# Predicate form of #check.
#
# @param word [String, nil]
# @return [Boolean] the :found value reported by #check
def correct?(word)
  outcome = check(word)
  outcome[:found]
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
# Reads a newline-delimited UTF-8 word list into a Set for fast lookup.
#
# Each line is stripped of surrounding whitespace. Blank lines are skipped
# so the empty string never becomes a dictionary entry (it would otherwise
# sit within edit distance 2 of every two-character word). A nil or missing
# path yields an empty dictionary instead of raising.
#
# @param path [String, nil] location of the word list
# @return [Set<String>] the loaded words; also stored in @dictionary
def load_dictionary(path)
  words = Set.new

  if path && File.exist?(path)
    # Stream line by line rather than materialising the whole file first.
    File.foreach(path, encoding: 'UTF-8') do |line|
      entry = line.strip
      words.add(entry) unless entry.empty?
    end
  end

  @dictionary = words
end
|
|
73
|
+
|
|
74
|
+
# Kana rewrites tried before the edit-distance scan. Each value lists the
# replacement strings substituted for the key character.
# NOTE(review): %w[かが] is a single two-character string, so most entries
# replace one kana with two (e.g. か -> かが); confirm that is intended
# rather than %w[か が].
KANA_SUBSTITUTIONS = {
  'あ' => %w[ああ],
  'い' => %w[いい],
  'う' => %w[うう],
  'え' => %w[ええ],
  'お' => %w[おお],
  'か' => %w[かが],
  'き' => %w[きぎ],
  'く' => %w[くぐ],
  'け' => %w[けげ],
  'こ' => %w[こご],
  'さ' => %w[さざ],
  'し' => %w[しじ],
  'す' => %w[すず],
  'せ' => %w[せぜ],
  'そ' => %w[そぞ],
  'た' => %w[ただ],
  'ち' => %w[ちぢ],
  'つ' => %w[つづ],
  'て' => %w[てで],
  'と' => %w[とど],
  'は' => %w[はば],
  'ひ' => %w[ひび],
  'ふ' => %w[ふぶ],
  'へ' => %w[へべ],
  'ほ' => %w[ほぼ],
  'ま' => %w[まま],
  'み' => %w[みみ],
  'む' => %w[むむ],
  'め' => %w[めめ],
  'も' => %w[もも],
  'や' => %w[やや],
  'ゆ' => %w[ゆゆ],
  'よ' => %w[よよ],
  'ら' => %w[らら],
  'り' => %w[りり],
  'る' => %w[るる],
  'れ' => %w[れれ],
  'ろ' => %w[ろろ],
  'わ' => %w[わわ],
  'を' => %w[お]
}.freeze

# Largest edit distance at which a dictionary word is still suggested.
MAX_SUGGESTION_DISTANCE = 2

# Builds up to max_suggestions replacement candidates for an unknown word.
#
# Kana-substitution hits come first, followed by dictionary words within
# MAX_SUGGESTION_DISTANCE edits ordered closest-first. Ranking happens
# before truncation, so a distance-1 match is never displaced by a
# distance-2 match that merely appears earlier in the dictionary.
#
# @param word [String] the word to correct
# @param max_suggestions [Integer] upper bound on the result size
# @return [Array<String>] unique suggestions, best first
def generate_suggestions(word, max_suggestions)
  (substitution_candidates(word) + ranked_neighbours(word)).uniq.first(max_suggestions)
end

# Applies every KANA_SUBSTITUTIONS rewrite at every position and keeps the
# results that are dictionary words.
def substitution_candidates(word)
  hits = []
  word.chars.each_with_index do |char, index|
    replacements = KANA_SUBSTITUTIONS[char]
    next unless replacements

    replacements.each do |replacement|
      candidate = word.dup
      candidate[index] = replacement
      hits << candidate if @dictionary.include?(candidate)
    end
  end
  hits
end

# Dictionary words 1..MAX_SUGGESTION_DISTANCE edits away from word, nearest
# first; ties keep dictionary order. Words shorter than two characters are
# not scanned.
def ranked_neighbours(word)
  return [] if word.length < 2

  ranked = []
  @dictionary.each do |dict_word|
    next if dict_word.empty?
    # The edit distance is never smaller than the length difference, so
    # this cheap test skips most of the dictionary without changing results.
    next if (dict_word.length - word.length).abs > MAX_SUGGESTION_DISTANCE

    distance = levenshtein_distance(word, dict_word)
    next unless distance.between?(1, MAX_SUGGESTION_DISTANCE)

    # The running index makes the sort stable and keeps it from ever
    # comparing the words themselves.
    ranked << [distance, ranked.length, dict_word]
  end
  ranked.sort.map(&:last)
end

# Levenshtein edit distance between two strings, counted in characters.
#
# Works on character arrays (indexing a multibyte String by position is a
# linear scan) and keeps only two rows of the DP table.
#
# @param a [String]
# @param b [String]
# @return [Integer] minimum number of insertions, deletions and substitutions
def levenshtein_distance(a, b)
  return a.length if b.empty?
  return b.length if a.empty?

  target = b.chars
  previous = (0..target.length).to_a

  a.chars.each_with_index do |source_char, i|
    current = [i + 1]
    target.each_with_index do |target_char, j|
      cost = source_char == target_char ? 0 : 1
      current << [previous[j + 1] + 1, current[j] + 1, previous[j] + cost].min
    end
    previous = current
  end

  previous.last
end
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# Japanese tokenizer with morphological analysis.
#
# Adds no behaviour of its own; everything is inherited from
# Language::Tokenizer::JapaneseTokenizer.
# NOTE(review): this file does not require the JapaneseTokenizer source
# itself; presumably it is loaded elsewhere before this class body runs —
# confirm.
class Tokenizer < Language::Tokenizer::JapaneseTokenizer
end
|
|
162
|
+
|
|
163
|
+
# Japanese POS tagger using morphological analysis.
|
|
164
|
+
#
|
|
165
|
+
# Japanese POS tagging is integrated with tokenization via Suika gem,
|
|
166
|
+
# which provides both segmentation and part-of-speech information.
|
|
167
|
+
#
|
|
168
|
+
# Suika output format: surface<TAB>POS,subcat1,subcat2,subcat3,conj_type,conj_form,lemma,reading,pronunciation
|
|
169
|
+
# Example: "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
|
|
170
|
+
#
|
|
171
|
+
# POS tags use universal English categories for common types, and ROMAJI
|
|
172
|
+
# (Latin script) identifiers based on Japanese terminology only for
|
|
173
|
+
# language-specific categories without universal equivalents.
|
|
174
|
+
class POSTagger < Components::PosTagger
|
|
175
|
+
# Japanese POS tag mappings from Suika to standard identifiers.
|
|
176
|
+
#
|
|
177
|
+
# Strategy: Use universal English POS tags (NOUN, VERB, etc.) with
|
|
178
|
+
# English suffixes for subcategories. All identifiers are ASCII.
|
|
179
|
+
#
|
|
180
|
+
# Main categories (field 0) - universal:
|
|
181
|
+
# - 名詞 → NOUN
|
|
182
|
+
# - 動詞 → VERB
|
|
183
|
+
# - 助詞 → PARTICLE
|
|
184
|
+
# - 助動詞 → AUX
|
|
185
|
+
#
|
|
186
|
+
# Noun subcategories (field 1):
|
|
187
|
+
# - NOUN_COMMON: 一般 - common nouns
|
|
188
|
+
# - NOUN_PROPER: 固有名詞 - proper nouns
|
|
189
|
+
# - NOUN_PROPER_GEOGRAPHIC: 固有名詞,地域 - proper noun, geographic
|
|
190
|
+
# - NOUN_SUFFIX: 接尾 - suffixes
|
|
191
|
+
# - NOUN_DEPENDENT: 非自立 - dependent nouns (cannot stand alone)
|
|
192
|
+
# - NOUN_SA_CONNECTION: サ変接続 - sa-variant connection nouns
|
|
193
|
+
#
|
|
194
|
+
# Particle subcategories (field 1):
|
|
195
|
+
# - PARTICLE_GRAMMAR: 格助詞 - grammar/case particles (が, を, に, etc.)
|
|
196
|
+
# - PARTICLE_BINDING: 係助詞 - binding particles (は, も, etc.)
|
|
197
|
+
# - PARTICLE_ADNOMINAL: 連体化 - adnominal particles (の)
|
|
198
|
+
#
|
|
199
|
+
# Verb subcategories (field 1):
|
|
200
|
+
# - VERB_INDEPENDENT: 自立 - independent verbs
|
|
201
|
+
# Default lookup table from Suika POS paths to ASCII tag identifiers.
# Keys are comma-joined prefixes of the Suika feature list (main category
# first, then subcategories); #extract_pos_from_suika probes the longest
# prefix first, so the most specific entry present here wins.
FLAG_TO_POS = {
  # Main categories - universal English
  '名詞' => 'NOUN',
  '動詞' => 'VERB',
  '助詞' => 'PARTICLE',
  '助動詞' => 'AUX',

  # Noun subcategories
  '名詞,一般' => 'NOUN_COMMON',
  '名詞,固有名詞' => 'NOUN_PROPER',
  '名詞,固有名詞,地域' => 'NOUN_PROPER_GEOGRAPHIC',
  '名詞,接尾' => 'NOUN_SUFFIX',
  '名詞,非自立' => 'NOUN_DEPENDENT',
  '名詞,サ変接続' => 'NOUN_SA_CONNECTION',

  # Particle subcategories
  '助詞,格助詞' => 'PARTICLE_GRAMMAR',
  '助詞,係助詞' => 'PARTICLE_BINDING',
  '助詞,連体化' => 'PARTICLE_ADNOMINAL',

  # Verb subcategories
  '動詞,自立' => 'VERB_INDEPENDENT',
}.freeze
|
|
224
|
+
|
|
225
|
+
# Prepares a tagger with an empty lookup cache. The Suika tagger itself is
# left unset here and created on the first call to #tag.
#
# @param dictionary_path [String, nil] stored as given
# @param flag_mapping [Hash] table from Suika POS paths to tag identifiers
def initialize(dictionary_path: nil, flag_mapping: FLAG_TO_POS)
  @lookup_cache = {}
  @suika_tagger = nil
  @flag_mapping = flag_mapping
  @dictionary_path = dictionary_path
end
|
|
231
|
+
|
|
232
|
+
# Annotates each token hash with :pos_tag and :lemma.
#
# Blank tokens receive nil for both keys. For every other token the lemma
# falls back to the token text when the lookup yields none.
#
# @param tokens [Array<Hash>, nil] hashes carrying the text under :token
# @return [Array<Hash>] new hashes, one per input token; [] for nil or
#   empty input
def tag(tokens)
  return [] if tokens.nil? || tokens.empty?

  # Suika is loaded lazily so the gem is only needed once tagging is used.
  require "suika" unless defined?(::Suika)
  @suika_tagger ||= ::Suika::Tagger.new

  tokens.map do |token|
    surface = token[:token]
    next token.merge(pos_tag: nil, lemma: nil) if surface.nil? || surface.empty?

    info = lookup_with_pos(surface)
    token.merge(pos_tag: info[:pos_tag], lemma: info[:lemma] || surface)
  end
end
|
|
249
|
+
|
|
250
|
+
# Returns the POS lookup table currently held by this tagger.
def flag_mapping
  @flag_mapping
end

# Replaces the POS lookup table held by this tagger. Entries already stored
# in the lookup cache are left untouched; call #clear_cache to drop them.
def flag_mapping=(mapping)
  @flag_mapping = mapping
end
|
|
257
|
+
|
|
258
|
+
# Empties the per-word memo filled by #lookup_with_pos, so subsequent
# lookups are parsed again.
def clear_cache
  @lookup_cache.clear
end
|
|
261
|
+
|
|
262
|
+
private
|
|
263
|
+
|
|
264
|
+
# Resolves the POS tag and lemma for one word, memoizing the answer per
# word in @lookup_cache.
#
# @param word [String, nil]
# @return [Hash] { pos_tag:, lemma: }; both nil for blank input
def lookup_with_pos(word)
  return { pos_tag: nil, lemma: nil } if word.nil? || word.empty?

  @lookup_cache.fetch(word) do
    # Cache miss: parse with Suika and derive both values from its output.
    parsed = @suika_tagger.parse(word)
    @lookup_cache[word] = {
      pos_tag: extract_pos_from_suika(parsed),
      lemma: extract_lemma_from_suika(parsed)
    }
  end
end
|
|
280
|
+
|
|
281
|
+
# Maps the first Suika result line to a POS identifier.
#
# The line has the form "surface<TAB>features", where features is a
# comma-separated list that starts with the main POS category followed by
# its subcategories (e.g. 名詞,固有名詞,地域,...). Progressively shorter
# comma-joined prefixes of that list are looked up, so the most specific
# known entry wins.
#
# The table consulted is this instance's flag mapping (as given to the
# constructor or the flag_mapping= writer), so a custom mapping takes
# effect; FLAG_TO_POS is used only when no mapping is set.
#
# @param parsed [Array<String>, nil] lines returned by the Suika tagger
# @return [String, nil] the mapped tag, or nil when nothing matches
def extract_pos_from_suika(parsed)
  return nil unless parsed && parsed.first

  _surface, feature_field = parsed.first.split("\t")
  return nil unless feature_field

  features = feature_field.split(',')
  mapping = @flag_mapping || FLAG_TO_POS

  # Probe at most the first six feature fields, longest prefix first.
  [features.length, 6].min.downto(1) do |depth|
    pos_path = features.first(depth).join(',')
    return mapping[pos_path] if mapping.key?(pos_path)
  end

  nil
end
|
|
317
|
+
|
|
318
|
+
# Extracts the lemma (dictionary form) from the first Suika result line.
#
# Documented line format:
#   surface<TAB>POS,subcat1,subcat2,subcat3,conj_type,conj_form,lemma,reading,pronunciation
# so the lemma is the seventh feature (index 6). When that field is
# missing, empty, or the "*" placeholder, the surface form is returned
# instead.
#
# @param parsed [Array<String>, nil] lines returned by the Suika tagger
# @return [String, nil] lemma or surface form; nil when the line carries no
#   feature field
def extract_lemma_from_suika(parsed)
  return nil unless parsed && parsed.first

  surface, feature_field = parsed.first.split("\t")
  return nil unless feature_field

  lemma = feature_field.split(',')[6]
  lemma.nil? || lemma.empty? || lemma == '*' ? surface : lemma
end
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
# Japanese grammar rules module.
module GrammarRules
  # Base class for grammar rules: carries an identifier, a display name and
  # a description. Subclasses must override #check.
  class Rule
    attr_reader :id, :name, :description

    # @param id [String] stable rule identifier
    # @param name [String] short display name
    # @param description [String] one-sentence explanation
    def initialize(id, name, description)
      @id = id
      @name = name
      @description = description
    end

    # @param tokens [Array<Hash>] tokens to inspect
    # @raise [NotImplementedError] always, unless overridden
    def check(tokens)
      raise NotImplementedError, "#{self.class} must implement #check"
    end
  end

  # Rule: Particle usage (wa vs ga)
  class ParticleRule < Rule
    def initialize
      super('JA_PARTICLE_USAGE', 'Particle Usage', 'Correct usage of topic marker は vs subject marker が.')
    end

    # Placeholder: no particle analysis is performed yet.
    #
    # @return [Array] always empty
    def check(_tokens)
      []
    end
  end

  # Rule: Script mixing
  class ScriptMixingRule < Rule
    # One pattern per script, using Unicode script properties rather than
    # raw block ranges. The prolonged sound mark "ー" (U+30FC) is assigned
    # to the Common script, so an elongated hiragana word is not mistaken
    # for katakana, and Han characters outside the basic CJK block (e.g.
    # 々) are recognised as kanji.
    SCRIPT_PATTERNS = [/\p{Hiragana}/, /\p{Katakana}/, /\p{Han}/].freeze

    def initialize
      super('JA_SCRIPT_MIXING', 'Script Mixing', 'Japanese text uses multiple scripts (Hiragana, Katakana, Kanji).')
    end

    # Flags every token whose text contains hiragana, katakana and kanji
    # all at once.
    #
    # @param tokens [Array<Hash>, nil] hashes with :token and :position
    # @return [Array<Hash>] one error hash per offending token; [] for nil
    def check(tokens)
      return [] if tokens.nil?

      tokens.each_with_object([]) do |token, errors|
        word = token[:token]
        next if word.nil? || word.empty?
        next unless SCRIPT_PATTERNS.all? { |pattern| word.match?(pattern) }

        errors << {
          rule_id: @id,
          position: token[:position],
          message: "Unusual script mixing in word '#{word}'",
          suggestion: 'Review script usage',
          context: word,
          suggestions: ['Use consistent script']
        }
      end
    end
  end

  # Lookup helpers for the built-in rules.
  class RuleRegistry
    class << self
      # @return [Array<Rule>] freshly built instances of every built-in rule
      def default_rules
        [ParticleRule.new, ScriptMixingRule.new]
      end

      # @param id [String] rule identifier
      # @return [Rule, nil] the built-in rule with that id, if any
      def get_rule(id)
        default_rules.find { |rule| rule.id == id }
      end
    end
  end
end
|
|
403
|
+
|
|
404
|
+
# Registration: expose this class under the bare language code and under
# the Japan-specific locale tag.
# NOTE(review): `register` is not defined in this file; presumably a
# class-level macro inherited from Language::Base — confirm.
register "ja"
register "ja-JP"

# Hunspell dictionary sources keyed by locale. The ja-JP entry is an empty
# hash. Only the outer hash is frozen.
HUNSPELL_DICTIONARIES = {
  'ja-JP' => {
    # Japanese dictionaries are in custom formats
    # Suika uses its own dictionary format
  }
}.freeze

# Display names for region codes, consulted by #description.
VARIANT_NAMES = {
  'JP' => 'Japan'
}.freeze
|
|
418
|
+
|
|
419
|
+
# Builds the language descriptor. When no variant is given it is derived
# from the region part of the code (e.g. "ja-JP" gives "JP").
#
# @param code [String] language or locale code
# @param name [String] display name
# @param variant [String, nil] explicit region code, if any
def initialize(code: "ja", name: "Japanese", variant: nil)
  super(code: code, name: name, variant: variant || extract_region_code(code))
end
|
|
423
|
+
|
|
424
|
+
# Human-readable label: the plain name, or "name (region)" when a variant
# is set. Region codes missing from VARIANT_NAMES are shown verbatim.
#
# @return [String]
def description
  return name unless variant

  "#{name} (#{VARIANT_NAMES.fetch(variant, variant)})"
end
|
|
429
|
+
|
|
430
|
+
# Returns this language's Tokenizer, creating it on first use and reusing
# the same instance afterwards.
def tokenizer
  @tokenizer ||= Tokenizer.new
end

# Returns the normalizer, creating it on first use. Japanese uses the
# generic Language::Normalizer::Base rather than a dedicated subclass.
def normalizer
  @normalizer ||= Language::Normalizer::Base.new
end
|
|
437
|
+
|
|
438
|
+
# Dictionary backend class used for this language.
def dictionary_class
  Dictionary::UnixWords
end

# Word-list locations for this language; the factories in this file use the
# first entry.
# NOTE(review): /usr/share/dict/words is conventionally an English word
# list on Unix systems — confirm that a Japanese list is expected here.
def default_dictionary_paths
  ["/usr/share/dict/words"]
end

# Script family identifier for this language.
def script_type
  :cjk
end
|
|
449
|
+
|
|
450
|
+
# Builds a new Japanese SpellChecker backed by the first default
# dictionary path.
def create_spell_checker
  # Japanese uses custom dictionary, not Hunspell format
  SpellChecker.new(
    dic_path: default_dictionary_paths.first,
    script: :cjk
  )
end

# Builds a new Tokenizer on every call (unlike #tokenizer, which memoizes).
def create_tokenizer
  Tokenizer.new
end

# Builds a new POSTagger configured with the first default dictionary path
# and the default POS mapping table.
def create_pos_tagger
  POSTagger.new(
    dictionary_path: default_dictionary_paths.first,
    flag_mapping: POSTagger::FLAG_TO_POS
  )
end
|
|
468
|
+
|
|
469
|
+
private
|
|
470
|
+
|
|
471
|
+
# Pulls the upper-cased region part out of a hyphenated locale code.
#
# @param code [String] e.g. "ja" or "ja-JP"
# @return [String, nil] everything after the first hyphen, upper-cased;
#   nil when the code contains no hyphen
def extract_region_code(code)
  _language, separator, region = code.partition("-")
  separator.empty? ? nil : region.upcase
end
|
|
475
|
+
end
|
|
476
|
+
end
|
|
477
|
+
end
|