kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Abstract base class for tokenizers.
|
|
7
|
+
#
|
|
8
|
+
# Uses Strategy pattern to allow different tokenization approaches
|
|
9
|
+
# for different languages.
|
|
10
|
+
#
|
|
11
|
+
# Subclasses must implement the tokenize method.
|
|
12
|
+
#
|
|
13
|
+
# @example Implement a tokenizer
|
|
14
|
+
# class MyTokenizer < Tokenizer::Base
|
|
15
|
+
# def tokenize(text)
|
|
16
|
+
# text.split(/ /)
|
|
17
|
+
# end
|
|
18
|
+
# end
|
|
19
|
+
class Base
  # Tokenize text into words.
  #
  # @param text [String] Text to tokenize
  # @return [Array<String>] Array of tokens
  # @raise [NotImplementedError] Must be implemented by subclass
  def tokenize(text)
    raise NotImplementedError, "#{self.class} must implement #tokenize"
  end

  # Tokenize text with positions.
  #
  # Walks the text one character at a time, tracking a 1-based line and
  # column. Whitespace is skipped (a "\n" starts a new line); at every other
  # position #extract_next_token is asked for a token that begins exactly
  # there. Characters that start no token are skipped one at a time.
  #
  # Offsets (:start, :end) are 0-based character indexes into +text+, with
  # :end exclusive.
  #
  # @param text [String, nil] Text to tokenize
  # @return [Array<Hash>] Array of {token:, start:, end:, line:, column:}
  def tokenize_with_positions(text)
    return [] if text.nil? || text.empty?

    tokens = []
    line = 1
    column = 1
    position = 0
    length = text.length

    while position < length
      char = text[position]

      # Whitespace never belongs to a token; only update the cursor.
      if char.match?(/\s/)
        if char == "\n"
          line += 1
          column = 1
        else
          column += 1
        end
        position += 1
        next
      end

      token_text = extract_next_token(text, position)

      # No token here. An empty string is treated like nil so that an
      # overridden #extract_next_token can never stall the loop.
      if token_text.nil? || token_text.empty?
        position += 1
        column += 1
        next
      end

      tokens << {
        token: token_text,
        start: position,
        end: position + token_text.length,
        line: line,
        column: column
      }

      # Advance the cursor over the token, honouring embedded newlines.
      token_text.each_char do |token_char|
        position += 1
        if token_char == "\n"
          line += 1
          column = 1
        else
          column += 1
        end
      end
    end

    tokens
  end

  # Check if a character is a word character.
  #
  # Tests +char+ against #word_boundary_regex.
  #
  # @param char [String] Single character
  # @return [Boolean] True if word character
  def word_char?(char)
    match?(word_boundary_regex, char)
  end

  # Get word boundary regex for this tokenizer.
  #
  # Subclasses should override this to define word boundaries.
  #
  # @return [Regexp] Word boundary regex
  # @raise [NotImplementedError] Unless overridden by the subclass
  def word_boundary_regex
    raise NotImplementedError, "#{self.class} must implement #word_boundary_regex"
  end

  # Normalize a token.
  #
  # Subclasses can override this for language-specific normalization.
  # The base implementation returns the token unchanged.
  #
  # @param token [String] Token to normalize
  # @return [String] Normalized token
  def normalize(token)
    token
  end

  # Check if a token should be skipped.
  #
  # Skips empty tokens, tokens made only of digits, and single characters
  # that are not letters. Subclasses can override this for
  # language-specific filtering.
  #
  # @param token [String] Token to check
  # @return [Boolean] True if token should be skipped
  def skip_token?(token)
    return true if token.empty?
    # \A / \z anchor the whole token; ^ / $ would match any single line.
    return true if token.match?(/\A\d+\z/)

    token.length < 2 && token.match?(/\A[^\p{L}]\z/)
  end

  protected

  # Extract the token that starts exactly at +position+.
  #
  # The pattern is anchored with \A rather than ^: ^ also matches after
  # every newline, which would let the search return a word from a later
  # line and report it at the current position.
  #
  # @param text [String] Full text
  # @param position [Integer] Current position
  # @return [String, nil] Next token or nil
  def extract_next_token(text, position)
    text[position..][/\A#{word_pattern}/]
  end

  # Get pattern for matching tokens.
  #
  # @return [String] Regex pattern string (one or more #word_chars)
  def word_pattern
    "[#{word_chars}]+"
  end

  # Get word characters for this tokenizer.
  #
  # @return [String] Character class of word characters
  # @raise [NotImplementedError] Unless overridden by the subclass
  def word_chars
    raise NotImplementedError, "#{self.class} must implement #word_chars"
  end

  # Check if string matches regex.
  #
  # @param regex [Regexp] Regex to match
  # @param string [String] String to check
  # @return [Boolean] True if matches
  def match?(regex, string)
    regex.match?(string)
  end
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for French text.
|
|
7
|
+
#
|
|
8
|
+
# Ported from LanguageTool's FrenchWordTokenizer.
|
|
9
|
+
#
|
|
10
|
+
# Handles:
|
|
11
|
+
# - Apostrophes (l', d', qu', c'est, j'ai, etc.)
|
|
12
|
+
# - Hyphens (c'est-à-dire, rendez-vous, etc.)
|
|
13
|
+
# - Decimal points/commas
|
|
14
|
+
# - Elided-word prefixes (13 patterns: c', j', n', m', t', s', l', d', qu', jusqu', puisqu', quoiqu', lorsqu')
|
|
15
|
+
class FrenchTokenizer < Base
  # French word separators - most punctuation and whitespace.
  # Note: apostrophe (') and hyphen (-) are NOT separators here; they are
  # handled per word by #split_french_word.
  WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze

  # Do-not-split list (from LanguageTool). Compared against the downcased word.
  DO_NOT_SPLIT = %w[
    mers-cov mcgraw-hill sars-cov-2 sars-cov
    ph-metre ph-metres anti-ivg anti-uv anti-vih al-qaïda
    c'est-à-dire add-on add-ons rendez-vous garde-à-vous
    chez-eux chez-moi chez-nous chez-soi chez-toi chez-vous
    m'as-tu-vu
  ].freeze

  # Elided prefixes that are split off the following word.
  #
  # Each pattern matches the prefix at the START of a word and captures it
  # (apostrophe included). The patterns must not be anchored at the end:
  # with a trailing anchor they only match a bare prefix such as "qu'" and
  # the "prefix + rest" split in #split_contractions can never happen.
  CONTRACTION_PATTERNS = [
    # c' followed by word: c'est, c'était, etc.
    /\A(c['\u2019])/i,
    # j' (je): j'ai, j'aime, etc.
    /\A(j['\u2019])/i,
    # n' (ne): n'a, n'est, etc.
    /\A(n['\u2019])/i,
    # m' (me): m'a, m'appelle, etc.
    /\A(m['\u2019])/i,
    # t' (te): t'a, t'asseoir, etc.
    /\A(t['\u2019])/i,
    # s' (se): s'a, s'appelle, etc.
    /\A(s['\u2019])/i,
    # l' (le/la): l'a, l'homme, l'eau, etc.
    /\A(l['\u2019])/i,
    # d' (de): d'un, d'une, d'abord, etc.
    /\A(d['\u2019])/i,
    # qu' (que): qu'un, qu'une, qu'est, etc.
    /\A(qu['\u2019])/i,
    # jusqu'à, jusqu'aux, etc.
    /\A(jusqu['\u2019])/i,
    # puisqu'il, etc.
    /\A(puisqu['\u2019])/i,
    # quoiqu'il, etc.
    /\A(quoiqu['\u2019])/i,
    # lorsqu'il, etc.
    /\A(lorsqu['\u2019])/i
  ].freeze

  # Matches a hyphenated word whose first part begins with an elided prefix.
  # Case-insensitive, like CONTRACTION_PATTERNS, so "Qu'est-ce" is treated
  # the same as "qu'est-ce".
  CONTRACTION_PREFIX = /\A(?:jusqu|lorsqu|puisqu|quoiqu|qu|[cçjnmtsld])['\u2019]/i.freeze

  # Tokenize French text.
  #
  # Steps: map U+2010/U+2011 hyphens to "-", normalize apostrophes, split on
  # WORD_SEPARATORS, split each piece on hyphens/elisions, then apply
  # #normalize and drop anything #skip_token? rejects.
  #
  # @param text [String, nil] Text to tokenize
  # @return [Array<String>] Array of tokens
  def tokenize(text)
    return [] if text.nil? || text.strip.empty?

    prepared = normalize_apostrophes(text.tr("\u2010\u2011", "-"))

    prepared
      .split(WORD_SEPARATORS)
      .reject(&:empty?)
      .flat_map { |raw| split_french_word(raw) }
      .map { |token| normalize(token) }
      .reject { |token| skip_token?(token) }
  end

  protected

  # Normalize apostrophes to straight quotes.
  #
  # Maps U+2019 (right single quotation mark), U+2018 (left single quotation
  # mark) and U+02BC (modifier letter apostrophe) to "'". The code points are
  # written as escapes so the mapping cannot be silently turned into a no-op
  # by tools that rewrite typographic quotes in source files.
  #
  # @param text [String] Input text
  # @return [String] Text with normalized apostrophes
  def normalize_apostrophes(text)
    text.tr("\u2019\u2018\u02BC", "'")
  end

  # Split French word, handling contractions and hyphens.
  #
  # - Words on DO_NOT_SPLIT are returned whole.
  # - Hyphenated words are split on "-"; when the word starts with an elided
  #   prefix, each part is additionally passed through #split_contractions.
  # - Other words containing an apostrophe go through #split_contractions.
  #
  # @param word [String] Word to split
  # @return [Array<String>] Array of tokens
  def split_french_word(word)
    return [word] if DO_NOT_SPLIT.include?(word.downcase)

    if word.include?("-")
      parts = word.split("-").reject(&:empty?)
      return parts unless word.match?(CONTRACTION_PREFIX)

      return parts.flat_map { |part| split_contractions(part) }
    end

    return split_contractions(word) if word.match?(/['\u2019]/)

    [word]
  end

  # Split contractions into component parts.
  #
  # A word starting with one of CONTRACTION_PATTERNS yields the prefix (with
  # its apostrophe) followed by the remainder, e.g. "qu'un" => ["qu'", "un"].
  # Any other single letter followed by an apostrophe is split at the
  # apostrophe, which is dropped. Everything else is returned unchanged.
  #
  # @param word [String] Word that might be a contraction
  # @return [Array<String>] Array of tokens
  def split_contractions(word)
    CONTRACTION_PATTERNS.each do |pattern|
      match = pattern.match(word)
      next unless match

      rest = match.post_match
      return rest.empty? ? [match[1]] : [match[1], rest]
    end

    # Fallback for single letters not covered by the patterns above.
    if word.match?(/\A[a-zç]['\u2019]/i)
      parts = word.split(/['\u2019]/, 2)
      return parts if parts.length == 2
    end

    [word]
  end

  # @return [Regexp] Separator pattern used by this tokenizer
  def word_separators
    WORD_SEPARATORS
  end
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for German text.
|
|
7
|
+
#
|
|
8
|
+
# Ported from LanguageTool's GermanWordTokenizer.
|
|
9
|
+
#
|
|
10
|
+
# Handles:
|
|
11
|
+
# - Underscore as word character (not a separator)
|
|
12
|
+
# - Single low quote (‚) as word character (not a separator)
|
|
13
|
+
# - Umlauts (ä, ö, ü, ß)
|
|
14
|
+
#
|
|
15
|
+
# The LanguageTool implementation adds two characters to the word characters:
|
|
16
|
+
# underscore (_) and single low quote (‚ - U+201A).
|
|
17
|
+
class GermanTokenizer < Base
  # Characters that terminate a word in German text. Underscore (_) and the
  # single low quote (U+201A) are deliberately not listed, so they remain
  # part of the surrounding token.
  WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*+\-·]/.freeze

  # Split German text on WORD_SEPARATORS, normalize each piece and keep the
  # ones #skip_token? does not reject.
  #
  # @param text [String, nil] Text to tokenize
  # @return [Array<String>] Array of tokens
  def tokenize(text)
    return [] if text.nil?
    return [] if text.strip.empty?

    normalized = text.split(WORD_SEPARATORS).map { |piece| normalize(piece) }
    normalized.reject { |candidate| skip_token?(candidate) }
  end

  protected

  # @return [Regexp] Separator pattern used by this tokenizer
  def word_separators
    WORD_SEPARATORS
  end
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "suika"
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Language
|
|
7
|
+
module Tokenizer
|
|
8
|
+
# Tokenizer for Japanese text.
|
|
9
|
+
#
|
|
10
|
+
# Uses Suika gem for morphological analysis.
|
|
11
|
+
#
|
|
12
|
+
# Suika is a pure Ruby Japanese morphological analyzer with a built-in
|
|
13
|
+
# dictionary from mecab-ipadic. It provides proper tokenization with
|
|
14
|
+
# part-of-speech information.
|
|
15
|
+
#
|
|
16
|
+
# @see https://github.com/yoshoku/suika
|
|
17
|
+
class JapaneseTokenizer < Base
  # Japanese word separators - keep it simple since Suika handles tokenization
  WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze

  class << self
    # Shared Suika tagger, created on first use and reused afterwards.
    #
    # Held in a class-level instance variable rather than a class variable,
    # so the state belongs to JapaneseTokenizer alone instead of being
    # shared across the inheritance tree.
    #
    # @return [Suika::Tagger]
    def tagger
      @tagger ||= ::Suika::Tagger.new
    end
  end

  # Tokenize Japanese text with Suika.
  #
  # Each entry returned by Suika::Tagger#parse is treated as
  # "surface\tfeatures"; only the surface form (the part before the first
  # tab) is kept. Blank surfaces are dropped.
  # NOTE(review): the entry format is taken from the original comments in
  # this file - confirm against the Suika version in use.
  #
  # @param text [String, nil] Text to tokenize
  # @return [Array<String>] Surface forms in order of appearance
  def tokenize(text)
    return [] if text.nil? || text.strip.empty?

    JapaneseTokenizer.tagger.parse(text).each_with_object([]) do |entry, tokens|
      surface = entry.split("\t").first
      tokens << surface if surface && !surface.strip.empty?
    end
  end

  protected

  # Detect whether text contains kana.
  #
  # Only Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) are checked;
  # text written purely in kanji returns false.
  #
  # @param text [String] Text to check
  # @return [Boolean] True if at least one kana character is present
  def japanese?(text)
    text.match?(/[\u3040-\u309F\u30A0-\u30FF]/) # Hiragana or Katakana
  end

  # @return [Regexp] Separator pattern declared for this tokenizer
  def word_separators
    WORD_SEPARATORS
  end
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for Latin-script languages.
|
|
7
|
+
#
|
|
8
|
+
# Base tokenizer for English, French, German, Spanish, Portuguese,
|
|
9
|
+
# and other European languages using Latin script.
|
|
10
|
+
#
|
|
11
|
+
# Handles:
|
|
12
|
+
# - Standard word boundaries (whitespace, punctuation)
|
|
13
|
+
# - Apostrophes within words (contractions, elisions)
|
|
14
|
+
# - Hyphenated words
|
|
15
|
+
# - Numbers with units
|
|
16
|
+
#
|
|
17
|
+
# Subclasses can override for language-specific handling.
|
|
18
|
+
class LatinTokenizer < Base
  # Contents of the character class for word characters: any Latin-script
  # letter, ASCII digits and the apostrophe.
  #
  # \p{Latin} is used instead of the range "a-zA-Zà-ÿ": that range leaves out
  # uppercase accented letters (É, Ü, À ...) and ß, and includes the
  # division sign (U+00F7).
  WORD_CHARS = "\\p{Latin}0-9'"

  # Punctuation that separates words
  WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*+\-=_]/.freeze

  # Contractions that should stay together (case-sensitive)
  CONTRACTIONS = %w[
    I'm I'd I've I'll you're you'd you've you'll he's he'd he'll
    she's she'd she'll it's it'd we're we'd we've we'll they're
    they'd they've they'll that's that'd that'll who's who'd who'll
    what's what'd what'll where's where'd when's when'd why's why'd
    how's how'd can't won't don't shouldn't couldn't wouldn't didn't
    isn't aren't wasn't weren't hasn't haven't hadn't doesn't
    mightn't mustn't shan't
  ].freeze

  # A single Latin-script letter.
  LETTER_PATTERN = /\p{Latin}/.freeze

  # A single word character (see WORD_CHARS).
  WORD_CHAR_PATTERN = /[#{WORD_CHARS}]/.freeze

  # A run of word characters at the very start of the string. Anchored with
  # \A: ^ would also match after any newline further into the string.
  TOKEN_PATTERN = /\A[#{WORD_CHARS}]+/.freeze

  # A known contraction at the very start of the string, immediately
  # followed by a character that is not a Latin letter.
  LEADING_CONTRACTION_PATTERN =
    /\A(?:#{Regexp.union(CONTRACTIONS).source})(?=[^\p{Latin}])/.freeze

  # Any known contraction delimited by word boundaries.
  BOUNDED_CONTRACTION_PATTERN =
    /\b(?:#{Regexp.union(CONTRACTIONS).source})\b/.freeze

  # Tokenize text into words.
  #
  # Splits on WORD_SEPARATORS, normalizes each piece and drops the pieces
  # #skip_token? rejects.
  #
  # @param text [String, nil] Text to tokenize
  # @return [Array<String>] Array of tokens
  def tokenize(text)
    return [] if text.nil? || text.strip.empty?

    text
      .split(WORD_SEPARATORS)
      .map { |token| normalize(token) }
      .reject { |token| skip_token?(token) }
  end

  # Get word boundary regex.
  #
  # @return [Regexp] Regex matching one word character
  def word_boundary_regex
    WORD_CHAR_PATTERN
  end

  # Normalize token by stripping surrounding whitespace.
  #
  # Subclasses can override for language-specific normalization.
  #
  # @param token [String] Token to normalize
  # @return [String] Normalized token
  def normalize(token)
    token.strip
  end

  # Check if token should be skipped.
  #
  # On top of the base rules, a token is skipped when it contains no
  # Latin-script letter at all.
  #
  # @param token [String] Token to check
  # @return [Boolean] True if should skip
  def skip_token?(token)
    return true if super

    !token.match?(LETTER_PATTERN)
  end

  protected

  # Get word characters.
  #
  # @return [String] Character class contents
  def word_chars
    WORD_CHARS
  end

  # Protect known contractions by replacing their apostrophe with U+FEFF.
  #
  # @param text [String] Input text
  # @return [String] New string with protected contractions
  def handle_contractions(text)
    text.gsub(BOUNDED_CONTRACTION_PATTERN) { |hit| hit.tr("'", "\uFEFF") }
  end

  # Extract the token that starts exactly at +position+.
  #
  # A known contraction followed by a non-letter wins; otherwise the longest
  # run of word characters (which may contain apostrophes) is returned.
  #
  # @param text [String] Full text
  # @param position [Integer] Current position
  # @return [String, nil] Next token or nil
  def extract_next_token(text, position)
    remaining = text[position..]
    remaining[LEADING_CONTRACTION_PATTERN] || remaining[TOKEN_PATTERN]
  end
end
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
end
|