kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,5 @@
1
+ # Russian confusion sets
2
+ # See the English 'confusion_sets.txt' for a description of file format
3
+
4
+ не; ни; 1000000; # p=1.000, r=0.550, 116+191, 3grams, 2016-10-13
5
+ шасси; шоссе; 10000000; # p=1.000, r=0.125, 48+48, 3grams, 2016-10-12
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Language
5
+ module Tokenizer
6
+ # Abstract base class for tokenizers.
7
+ #
8
+ # Uses Strategy pattern to allow different tokenization approaches
9
+ # for different languages.
10
+ #
11
+ # Subclasses must implement the tokenize method.
12
+ #
13
+ # @example Implement a tokenizer
14
+ # class MyTokenizer < Tokenizer::Base
15
+ # def tokenize(text)
16
+ # text.split(/ /)
17
+ # end
18
+ # end
19
+ class Base
20
+ # Tokenize text into words (abstract hook).
21
+ # The base implementation never tokenizes; it only raises.
22
+ # @param text [String] Text to tokenize (unused here)
23
+ # @return [Array<String>] Array of tokens, as produced by a subclass override
24
+ # @raise [NotImplementedError] Always, unless a subclass overrides this method
25
+ def tokenize(text)
26
+ raise NotImplementedError, "#{self.class} must implement #tokenize"
27
+ end
28
+
29
+ # Tokenize text and report where each token was found.
30
+ #
31
+ # Whitespace is skipped by this method; token text comes from #extract_next_token.
32
+ # Offsets count characters from 0 (end is start + token length); line and column start at 1.
33
+ # @param text [String, nil] Text to tokenize; nil or empty yields []
34
+ # @return [Array<Hash>] Array of {token:, start:, end:, line:, column:}
35
+ def tokenize_with_positions(text)
36
+ return [] if text.nil?
37
+ return [] if text.empty?
38
+
39
+ tokens = []
40
+ line = 1
41
+ column = 1
42
+ position = 0
43
+
44
+ while position < text.length
45
+ # Skip whitespace, keeping line/column in step with position
46
+ while position < text.length && text[position].match?(/\s/)
47
+ if text[position] == "\n"
48
+ line += 1
49
+ column = 1
50
+ else
51
+ column += 1
52
+ end
53
+ position += 1
54
+ end
55
+
56
+ break if position >= text.length
57
+
58
+ # Remember where the candidate token starts before asking for its text
59
+ start_pos = position
60
+ start_line = line
61
+ start_column = column
62
+
63
+ token_text = extract_next_token(text, position)
64
+
65
+ if token_text
66
+ tokens << {
67
+ token: token_text,
68
+ start: start_pos,
69
+ end: start_pos + token_text.length,
70
+ line: start_line,
71
+ column: start_column
72
+ }
73
+
74
+ token_text.each_char do |char|
75
+ column += 1
76
+ position += 1
77
+ if char == "\n"
78
+ line += 1
79
+ column = 1
80
+ end
81
+ end
82
+ else
83
+ position += 1 # no token starts here: step over one character
84
+ column += 1
85
+ end
86
+ end
87
+
88
+ tokens
89
+ end
90
+
91
+ # Check if a character is a word character.
91
+ # Delegates to #match? with the regex from #word_boundary_regex.
92
+ # @param char [String] Single character
93
+ # @return [Boolean] True if #word_boundary_regex matches char
94
+ def word_char?(char)
95
+ match?(word_boundary_regex, char)
96
+ end
98
+
99
+ # Get the regex that #word_char? tests single characters against.
100
+ #
101
+ # Abstract in Base: subclasses must override this; the base version only raises.
102
+ #
103
+ # @return [Regexp] Regex used by #word_char? (Base itself never returns one)
104
+ def word_boundary_regex
105
+ raise NotImplementedError, "#{self.class} must implement #word_boundary_regex"
106
+ end
107
+
108
+ # Normalize a token (identity in Base).
108
+ #
109
+ # Subclasses can override this for language-specific normalization.
110
+ #
111
+ # @param token [String] Token to normalize
112
+ # @return [String] The same token, unchanged, in this base implementation
113
+ def normalize(token)
114
+ token
115
+ end
117
+
118
+ # Check if a token should be skipped.
119
+ # Skips empty tokens, digit-only tokens and single non-letter characters.
120
+ # Subclasses can override this for language-specific filtering.
121
+ #
122
+ # @param token [String] Token to check
123
+ # @return [Boolean] True if token should be skipped
124
+ def skip_token?(token)
125
+ return true if token.empty?
126
+ return true if token.match?(/^\d+$/) # Pure numbers
127
+ return true if token.length < 2 && token.match?(/^[^\p{L}]$/) # one char that is not a Unicode letter
128
+
129
+ false
130
+ end
131
+
132
+ protected
133
+
134
+ # Extract the next token from text at position.
135
+ #
136
+ # @param text [String] Full text
137
+ # @param position [Integer] Current position
138
+ # @return [String, nil] Next token or nil
139
+ def extract_next_token(text, position)
140
+ remaining = text[position..]
141
+ match = remaining.match(/^#{word_pattern}/)
142
+ match ? match[0] : nil
143
+ end
144
+
145
+ # Get the regex source used by #extract_next_token to match one token.
146
+ #
147
+ # @return [String] Pattern matching one or more characters from #word_chars
148
+ def word_pattern
149
+ "[#{word_chars}]+"
150
+ end
151
+
152
+ # Get word characters for this tokenizer (abstract; the base version only raises).
153
+ #
154
+ # @return [String] Contents of a regex character class, placed inside [...] by #word_pattern
155
+ def word_chars
156
+ raise NotImplementedError, "#{self.class} must implement #word_chars"
157
+ end
158
+
159
+ # Check if string matches regex (thin wrapper around Regexp#match?).
160
+ #
161
+ # @param regex [Regexp] Regex to match
162
+ # @param string [String] String to check
163
+ # @return [Boolean] True if regex matches anywhere in string
164
+ def match?(regex, string)
165
+ regex.match?(string)
166
+ end
167
+ end
168
+ end
169
+ end
170
+ end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Language
5
+ module Tokenizer
6
+ # Tokenizer for French text.
7
+ #
8
+ # Ported from LanguageTool's FrenchWordTokenizer.
9
+ #
10
+ # Handles:
11
+ # - Apostrophes (l', d', qu', c'est, j'ai, etc.)
12
+ # - Hyphens (c'est-à-dire, rendez-vous, etc.)
13
+ # - Decimal points/commas
14
+ # - Elided prefixes listed in CONTRACTION_PATTERNS (13 patterns: c', j', n', m', t', s', l', d', qu', jusqu', puisqu', quoiqu', lorsqu')
15
+ class FrenchTokenizer < Base
16
+ # French word separators - most punctuation and whitespace
17
+ # Note: apostrophe (') is NOT a separator in French (used for contractions)
18
+ WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze
19
+
20
+ # Do-not-split list (from LanguageTool)
21
+ DO_NOT_SPLIT = %w[
22
+ mers-cov mcgraw-hill sars-cov-2 sars-cov
23
+ ph-metre ph-metres anti-ivg anti-uv anti-vih al-qaïda
24
+ c'est-à-dire add-on add-ons rendez-vous garde-à-vous
25
+ chez-eux chez-moi chez-nous chez-soi chez-toi chez-vous
26
+ m'as-tu-vu
27
+ ].freeze
28
+
29
+ # Contraction patterns (from LanguageTool)
30
+ # French contractions are complex: l', d', qu', c'est, j'ai, n'a, etc.
31
+ CONTRACTION_PATTERNS = [
32
+ # c' followed by word: c'est, c'était, etc.
33
+ /^(c[''])$/i,
34
+ # j' (je): j'ai, j'aime, etc.
35
+ /^(j[''])$/i,
36
+ # n' (ne): n'a, n'est, etc.
37
+ /^(n[''])$/i,
38
+ # m' (me): m'a, m'appelle, etc.
39
+ /^(m[''])$/i,
40
+ # t' (te): t'a, t'asseoir, etc.
41
+ /^(t[''])$/i,
42
+ # s' (se): s'a, s'appelle, etc.
43
+ /^(s[''])$/i,
44
+ # l' (le/la): l'a, l'homme, l'eau, etc.
45
+ /^(l[''])$/i,
46
+ # d' (de): d'un, d'une, d'abord, etc.
47
+ /^(d[''])$/i,
48
+ # qu' (que): qu'un, qu'une, qu'est, etc.
49
+ /^(qu[''])$/i,
50
+ # jusqu'à, jusqu'aux, etc.
51
+ /^(jusqu[''])$/i,
52
+ # puisque, puisqu'il, etc.
53
+ /^(puisqu[''])$/i,
54
+ # quoique, quoiqu'il, etc.
55
+ /^(quoiqu[''])$/i,
56
+ # lorsque, lorsqu'il, etc.
57
+ /^(lorsqu[''])$/i,
58
+ ].freeze
59
+
60
+ def tokenize(text)
61
+ return [] if text.nil? || text.strip.empty?
62
+
63
+ # Map U+2010 and U+2011 hyphen variants to ASCII hyphen-minus (U+002D)
64
+ text = text.gsub("\u2010", "\u002d")
65
+ text = text.gsub("\u2011", "\u002d")
66
+
67
+ # Normalize apostrophes before any apostrophe-based splitting
68
+ text = normalize_apostrophes(text)
69
+
70
+ # Split on separator characters (apostrophe and hyphen are not separators here)
71
+ raw_tokens = text.split(WORD_SEPARATORS)
72
+
73
+ # Expand each raw token into one or more parts
74
+ tokens = []
75
+ raw_tokens.each do |token|
76
+ next if token.empty?
77
+
78
+ # Try to split contractions and hyphenated words
79
+ parts = split_french_word(token)
80
+ tokens.concat(parts)
81
+ end
82
+
83
+ # Normalize every part, then drop the ones skip_token? rejects
84
+ tokens
85
+ .map { |token| normalize(token) }
86
+ .reject { |token| skip_token?(token) }
87
+ end
88
+
89
+ protected
90
+
91
+ # Normalize apostrophes to straight quotes.
92
+ #
93
+ # @param text [String] Input text
94
+ # @return [String] Text with normalized apostrophes
95
+ def normalize_apostrophes(text)
96
+ text
97
+ .gsub("'", "'")
98
+ .gsub("'", "'")
99
+ .gsub("'", "'")
100
+ end
101
+
102
+ # Split French word, handling contractions and hyphens.
103
+ # Words on the DO_NOT_SPLIT list (compared case-insensitively) are returned whole.
104
+ # @param word [String] Word to split
105
+ # @return [Array<String>] One or more parts; [word] when nothing is split
106
+ def split_french_word(word)
107
+ # Check do-not-split list
108
+ return [word] if DO_NOT_SPLIT.include?(word.downcase)
109
+
110
+ # Handle hyphens first (but not for do-not-split words)
111
+ if word.include?("-")
112
+ # Does the word begin with an elided prefix, as in "jusqu'à-ce"?
113
+ if word.match?(/^(jusqu['']|[cç]['']|j['']|n['']|m['']|t['']|s['']|l['']|d['']|qu['']|lorsqu['']|puisqu['']|quoiqu[''])/)
114
+ # Split on hyphens, then run each non-empty part through split_contractions
115
+ parts = []
116
+ word.split("-", -1).each do |part|
117
+ next if part.empty?
118
+ parts.concat(split_contractions(part))
119
+ end
120
+ return parts
121
+ else
122
+ # Regular hyphenated word - split on hyphens only; apostrophes inside the parts are left alone
123
+ return word.split("-", -1).reject(&:empty?)
124
+ end
125
+ end
126
+
127
+ # No hyphen: handle contractions
128
+ if word.include?("'")
129
+ return split_contractions(word)
130
+ end
131
+
132
+ # No special handling needed
133
+ [word]
134
+ end
135
+
136
+ # Split contractions into component parts.
137
+ # NOTE(review): every CONTRACTION_PATTERNS entry is anchored with ^ and $, so the loop only matches a word that is exactly an elided prefix - confirm whether the trailing $ is intended.
138
+ # @param word [String] Word that might be a contraction
139
+ # @return [Array<String>] Parts of the word; [word] when no rule applies
140
+ def split_contractions(word)
141
+ # Try each contraction pattern
142
+ CONTRACTION_PATTERNS.each do |pattern|
143
+ match = word.match(pattern)
144
+ if match
145
+ # Return the contraction and the rest of the word
146
+ contraction = match[1]
147
+ rest = word.sub(/^#{Regexp.escape(contraction)}/, "")
148
+ return [contraction, rest] unless rest.empty?
149
+ return [contraction]
150
+ end
151
+ end
152
+
153
+ # Fallback: word begins with a single letter followed by an apostrophe (e.g. c'est)
154
+ if word.match?(/^[cç]['']|^[a-z]['']/i)
155
+ # Split at the first apostrophe; the apostrophe itself is dropped from both parts
156
+ parts = word.split("'", 2)
157
+ return parts if parts.length == 2
158
+ end
159
+
160
+ # No pattern matched, return the word as-is
161
+ [word]
162
+ end
163
+
164
+ def word_separators # returns this tokenizer's WORD_SEPARATORS regexp
165
+ WORD_SEPARATORS
166
+ end
167
+ end
168
+ end
169
+ end
170
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Language
5
+ module Tokenizer
6
+ # Tokenizer for German text.
7
+ #
8
+ # Ported from LanguageTool's GermanWordTokenizer.
9
+ #
10
+ # Handles:
11
+ # - Underscore as word character (not a separator)
12
+ # - Single low quote (‚) as word character (not a separator)
13
+ # - Umlauts (ä, ö, ü, ß) stay inside words because they are not separators
14
+ #
15
+ # The LanguageTool implementation adds two characters to the word characters:
16
+ # underscore (_) and single low quote (‚ - U+201A).
17
+ class GermanTokenizer < Base
18
+ # Separator characters for #tokenize; underscore and U+201A are deliberately absent, while + and - are included
19
+ WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*+\-·]/.freeze
20
+
21
+ def tokenize(text)
22
+ return [] if text.nil? || text.strip.empty?
23
+
24
+ # Split on separator characters (may yield empty strings, removed below)
25
+ raw_tokens = text.split(WORD_SEPARATORS)
26
+
27
+ # Normalize every piece, then drop the ones skip_token? rejects
28
+ raw_tokens
29
+ .map { |token| normalize(token) }
30
+ .reject { |token| skip_token?(token) }
31
+ end
32
+
33
+ protected
34
+
35
+ def word_separators # returns this tokenizer's WORD_SEPARATORS regexp
36
+ WORD_SEPARATORS
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "suika"
4
+
5
+ module Kotoshu
6
+ module Language
7
+ module Tokenizer
8
+ # Tokenizer for Japanese text.
9
+ #
10
+ # Uses Suika gem for morphological analysis.
11
+ #
12
+ # Suika is a pure Ruby Japanese morphological analyzer with a built-in
13
+ # dictionary from mecab-ipadic. It provides proper tokenization with
14
+ # part-of-speech information.
15
+ #
16
+ # @see https://github.com/yoshoku/suika
17
+ class JapaneseTokenizer < Base
18
+ # Japanese word separators - keep it simple since Suika handles tokenization
19
+ WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze
20
+
21
+ # Class variable to hold the Suika tagger instance
22
+ @@tagger = nil
23
+
24
+ def tokenize(text)
25
+ return [] if text.nil? || text.strip.empty?
26
+
27
+ # Create the tagger on first use; @@tagger is a class variable, so it is reused by later calls
28
+ @@tagger ||= ::Suika::Tagger.new
29
+
30
+ # The tagger's parse result is treated as a list of "surface\tfeatures" strings
31
+ tokens = []
32
+ parsed = @@tagger.parse(text)
33
+
34
+ parsed.each do |token|
35
+ # Example entry: "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
36
+ # The surface form is tab-separated from the POS features
37
+ surface = token.split("\t").first
38
+ tokens << surface if surface && !surface.strip.empty? # normalize/skip_token? are not applied here
39
+ end
40
+
41
+ tokens
42
+ end
43
+
44
+ protected
45
+
46
+ # Detect if text contains Japanese kana.
47
+ # Only Hiragana and Katakana are tested, so kanji-only text returns false.
48
+ # @param text [String] Text to check
49
+ # @return [Boolean] True if at least one Hiragana or Katakana character is present
50
+ def japanese?(text)
51
+ text.match?(/[\u3040-\u309F\u30A0-\u30FF]/) # Hiragana or Katakana
52
+ end
53
+
54
+ def word_separators # returns this tokenizer's WORD_SEPARATORS regexp
55
+ WORD_SEPARATORS
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kotoshu
4
+ module Language
5
+ module Tokenizer
6
+ # Tokenizer for Latin-script languages.
7
+ #
8
+ # Base tokenizer for English, French, German, Spanish, Portuguese,
9
+ # and other European languages using Latin script.
10
+ #
11
+ # Handles:
12
+ # - Standard word boundaries (whitespace, punctuation)
13
+ # - Apostrophes within words (contractions, elisions)
14
+ # - Hyphenated words
15
+ # - Numbers with units
16
+ #
17
+ # Subclasses can override for language-specific handling.
18
+ class LatinTokenizer < Base
19
+ # Latin word characters, including upper- and lower-case accented letters (À-Ö, Ø-ÿ; × is excluded)
20
+ WORD_CHARS = "a-zA-ZÀ-ÖØ-ÿ0-9'"
21
+
22
+ # Punctuation that separates words
23
+ WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*+\-=_]/
24
+
25
+ # Contractions that should stay together (each form listed once)
26
+ CONTRACTIONS = %w[
27
+ I'm I'd I've I'll you're you'd you've you'll he's he'd he'll
28
+ she's she'd she'll it's it'd we're we'd we've we'll they're
29
+ they'd they've they'll that's that'd that'll who's who'd who'll
30
+ what's what'd what'll where's where'd when's when'd why's why'd
31
+ how's how'd can't won't don't shouldn't couldn't wouldn't didn't
32
+ isn't aren't wasn't weren't hasn't haven't hadn't doesn't
33
+ mightn't mustn't shan't
34
+ ].freeze
35
+
36
+ # Tokenize text into words.
37
+ # Splits on WORD_SEPARATORS, strips each piece via #normalize and drops pieces rejected by #skip_token?.
38
+ # @param text [String, nil] Text to tokenize; nil or blank text yields []
39
+ # @return [Array<String>] Array of tokens
40
+ def tokenize(text)
41
+ return [] if text.nil? || text.strip.empty?
42
+
43
+ # Split on separator characters (may yield empty strings, removed below)
44
+ raw_tokens = text.split(WORD_SEPARATORS)
45
+
46
+ # Normalize every piece, then drop the ones skip_token? rejects
47
+ raw_tokens
48
+ .map { |token| normalize(token) }
49
+ .reject { |token| skip_token?(token) }
50
+ end
51
+
52
+ # Get the regex matching a single word character (despite the "boundary" name).
53
+ #
54
+ # @return [Regexp] Character class built from WORD_CHARS
55
+ def word_boundary_regex
56
+ /[#{WORD_CHARS}]/
57
+ end
58
+
59
+ # Normalize token by stripping surrounding whitespace.
60
+ #
61
+ # Subclasses can override for language-specific normalization.
62
+ #
63
+ # @param token [String] Token to normalize
64
+ # @return [String] Token with leading and trailing whitespace removed
65
+ def normalize(token)
66
+ token.strip
67
+ end
68
+
69
+ # Check if token should be skipped.
70
+ # Applies the inherited rules via super first, then the Latin-specific letter checks below.
71
+ # @param token [String] Token to check
72
+ # @return [Boolean] True if should skip
73
+ def skip_token?(token)
74
+ return true if super
75
+
76
+ # Skip pure numbers (Base#skip_token? already returns true for these via super)
77
+ return true if token.match?(/^\d+$/)
78
+
79
+ # Skip a single character that is not in a-z, A-Z or à-ÿ
80
+ return true if token.length == 1 && token.match?(/[^a-zA-Zà-ÿ]/)
81
+
82
+ # Skip empty tokens (Base#skip_token? already returns true for these via super)
83
+ return true if token.empty?
84
+
85
+ # Skip tokens with no character in a-z, A-Z or à-ÿ
86
+ return true unless token.match?(/[a-zA-Zà-ÿ]/)
87
+
88
+ false
89
+ end
90
+
91
+ protected
92
+
93
+ # Get word characters (the WORD_CHARS constant).
94
+ #
95
+ # @return [String] Contents of a regex character class, without the surrounding brackets
96
+ def word_chars
97
+ WORD_CHARS
98
+ end
99
+
100
+ # Protect the apostrophe in known contractions by swapping it for U+FEFF.
101
+ #
102
+ # @param text [String] Input text (not modified; a copy is edited)
103
+ # @return [String] Copy of text with the apostrophe of each CONTRACTIONS entry replaced by U+FEFF
104
+ def handle_contractions(text)
105
+ result = text.dup
106
+
107
+ # Protect common contractions
108
+ CONTRACTIONS.each do |contraction|
109
+ # Use word boundaries to avoid partial matches; matching is case-sensitive
110
+ result = result.gsub(/\b#{Regexp.escape(contraction)}\b/, contraction.gsub("'", "\uFEFF"))
111
+ end
112
+
113
+ result
114
+ end
115
+
116
+ # Extract next token with position.
117
+ #
118
+ # Override to handle apostrophes within words.
119
+ #
120
+ # @param text [String] Full text
121
+ # @param position [Integer] Current position
122
+ # @return [String, nil] Next token or nil
123
+ def extract_next_token(text, position)
124
+ remaining = text[position..]
125
+
126
+ # Check for contraction first
127
+ CONTRACTIONS.each do |contraction|
128
+ if remaining.start_with?(contraction) &&
129
+ remaining[contraction.length]&.match?(/\s|[^a-zA-Zà-ÿ]/)
130
+ return contraction
131
+ end
132
+ end
133
+
134
+ # Extract word with potential apostrophe
135
+ match = remaining.match(/^([#{WORD_CHARS}]+(?:'[#{WORD_CHARS}]+)?)/)
136
+ match ? match[1] : nil
137
+ end
138
+ end
139
+ end
140
+ end
141
+ end