kotoshu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +18 -0
  4. data/CHANGELOG.md +182 -0
  5. data/CLAUDE.md +172 -0
  6. data/CODE_OF_CONDUCT.md +132 -0
  7. data/LICENSE +31 -0
  8. data/README.adoc +955 -0
  9. data/Rakefile +12 -0
  10. data/SECURITY.md +93 -0
  11. data/examples/01_basic_word_checking.rb +38 -0
  12. data/examples/02_text_document_checking.rb +77 -0
  13. data/examples/03_dictionary_backends.rb +137 -0
  14. data/examples/04_trie_data_structure.rb +146 -0
  15. data/examples/05_suggestion_algorithms.rb +239 -0
  16. data/examples/06_configuration_advanced.rb +287 -0
  17. data/examples/07_multi_language_dictionaries.rb +278 -0
  18. data/exe/kotoshu +6 -0
  19. data/lib/kotoshu/algorithms/capitalization.rb +276 -0
  20. data/lib/kotoshu/algorithms/lookup.rb +876 -0
  21. data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
  22. data/lib/kotoshu/algorithms/permutations.rb +283 -0
  23. data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
  24. data/lib/kotoshu/algorithms/suggest.rb +575 -0
  25. data/lib/kotoshu/algorithms.rb +14 -0
  26. data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
  27. data/lib/kotoshu/cache/base_cache.rb +596 -0
  28. data/lib/kotoshu/cache/cache.rb +91 -0
  29. data/lib/kotoshu/cache/frequency_cache.rb +224 -0
  30. data/lib/kotoshu/cache/language_cache.rb +454 -0
  31. data/lib/kotoshu/cache/lookup_cache.rb +166 -0
  32. data/lib/kotoshu/cache/model_cache.rb +513 -0
  33. data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
  34. data/lib/kotoshu/cache.rb +40 -0
  35. data/lib/kotoshu/cli/auto_setup.rb +71 -0
  36. data/lib/kotoshu/cli/batch_reporter.rb +315 -0
  37. data/lib/kotoshu/cli/cache_command.rb +356 -0
  38. data/lib/kotoshu/cli/display_formatter.rb +431 -0
  39. data/lib/kotoshu/cli/errors.rb +36 -0
  40. data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
  41. data/lib/kotoshu/cli/language_resolver.rb +91 -0
  42. data/lib/kotoshu/cli/navigation_manager.rb +272 -0
  43. data/lib/kotoshu/cli/progress_reporter.rb +114 -0
  44. data/lib/kotoshu/cli/status_report.rb +130 -0
  45. data/lib/kotoshu/cli.rb +627 -0
  46. data/lib/kotoshu/commands/cache_command.rb +424 -0
  47. data/lib/kotoshu/commands/check_command.rb +312 -0
  48. data/lib/kotoshu/commands/model_command.rb +295 -0
  49. data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
  50. data/lib/kotoshu/components/pos_tagger.rb +98 -0
  51. data/lib/kotoshu/components/spell_checker.rb +73 -0
  52. data/lib/kotoshu/components/synthesizer.rb +60 -0
  53. data/lib/kotoshu/components/tokenizer.rb +58 -0
  54. data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
  55. data/lib/kotoshu/configuration/builder.rb +209 -0
  56. data/lib/kotoshu/configuration/resolver.rb +124 -0
  57. data/lib/kotoshu/configuration.rb +702 -0
  58. data/lib/kotoshu/core/exceptions.rb +165 -0
  59. data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
  60. data/lib/kotoshu/core/models/affix_rule.rb +260 -0
  61. data/lib/kotoshu/core/models/result/document_result.rb +263 -0
  62. data/lib/kotoshu/core/models/result/word_result.rb +203 -0
  63. data/lib/kotoshu/core/models/word.rb +142 -0
  64. data/lib/kotoshu/core/trie/builder.rb +119 -0
  65. data/lib/kotoshu/core/trie/node.rb +94 -0
  66. data/lib/kotoshu/core/trie/trie.rb +249 -0
  67. data/lib/kotoshu/core.rb +28 -0
  68. data/lib/kotoshu/data/common_words/de.yml +1800 -0
  69. data/lib/kotoshu/data/common_words/en.yml +1215 -0
  70. data/lib/kotoshu/data/common_words/es.yml +750 -0
  71. data/lib/kotoshu/data/common_words/fr.yml +1015 -0
  72. data/lib/kotoshu/data/common_words/pt.yml +870 -0
  73. data/lib/kotoshu/data/common_words/ru.yml +484 -0
  74. data/lib/kotoshu/data/common_words_loader.rb +152 -0
  75. data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
  76. data/lib/kotoshu/debug_logger.rb +146 -0
  77. data/lib/kotoshu/debug_mode.rb +134 -0
  78. data/lib/kotoshu/defaults.rb +86 -0
  79. data/lib/kotoshu/dictionaries/catalog.rb +817 -0
  80. data/lib/kotoshu/dictionary/base.rb +237 -0
  81. data/lib/kotoshu/dictionary/cspell.rb +254 -0
  82. data/lib/kotoshu/dictionary/custom.rb +224 -0
  83. data/lib/kotoshu/dictionary/hunspell.rb +526 -0
  84. data/lib/kotoshu/dictionary/plain_text.rb +282 -0
  85. data/lib/kotoshu/dictionary/repository.rb +248 -0
  86. data/lib/kotoshu/dictionary/unified.rb +260 -0
  87. data/lib/kotoshu/dictionary/unix_words.rb +218 -0
  88. data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
  89. data/lib/kotoshu/documents/document.rb +229 -0
  90. data/lib/kotoshu/documents/location.rb +139 -0
  91. data/lib/kotoshu/documents/markdown_document.rb +389 -0
  92. data/lib/kotoshu/documents/plain_text_document.rb +147 -0
  93. data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
  94. data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
  95. data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
  96. data/lib/kotoshu/embeddings/protocol.rb +83 -0
  97. data/lib/kotoshu/embeddings/protocols.rb +17 -0
  98. data/lib/kotoshu/embeddings/registry.rb +182 -0
  99. data/lib/kotoshu/embeddings/search.rb +192 -0
  100. data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
  101. data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
  102. data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
  103. data/lib/kotoshu/embeddings.rb +97 -0
  104. data/lib/kotoshu/fluent_checker.rb +91 -0
  105. data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
  106. data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
  107. data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
  108. data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
  109. data/lib/kotoshu/grammar/rule.rb +95 -0
  110. data/lib/kotoshu/grammar/rule_engine.rb +111 -0
  111. data/lib/kotoshu/grammar/rule_loader.rb +31 -0
  112. data/lib/kotoshu/grammar.rb +18 -0
  113. data/lib/kotoshu/integrity/audit_log.rb +88 -0
  114. data/lib/kotoshu/integrity/manifest.rb +117 -0
  115. data/lib/kotoshu/integrity/net_http.rb +46 -0
  116. data/lib/kotoshu/integrity.rb +25 -0
  117. data/lib/kotoshu/keyboard/layout.rb +115 -0
  118. data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
  119. data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
  120. data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
  121. data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
  122. data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
  123. data/lib/kotoshu/keyboard/registry.rb +146 -0
  124. data/lib/kotoshu/keyboard.rb +60 -0
  125. data/lib/kotoshu/language/detector.rb +242 -0
  126. data/lib/kotoshu/language/identifier.rb +378 -0
  127. data/lib/kotoshu/language/languages/base.rb +256 -0
  128. data/lib/kotoshu/language/normalizer/base.rb +137 -0
  129. data/lib/kotoshu/language/registry.rb +147 -0
  130. data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
  131. data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
  132. data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
  133. data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
  134. data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
  135. data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
  136. data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
  137. data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
  138. data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
  139. data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
  140. data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
  141. data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
  142. data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
  143. data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
  144. data/lib/kotoshu/language/tokenizer/base.rb +170 -0
  145. data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
  146. data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
  147. data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
  148. data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
  149. data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
  150. data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
  151. data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
  152. data/lib/kotoshu/language.rb +99 -0
  153. data/lib/kotoshu/languages/de/language.rb +546 -0
  154. data/lib/kotoshu/languages/en/language.rb +448 -0
  155. data/lib/kotoshu/languages/es/language.rb +459 -0
  156. data/lib/kotoshu/languages/fr/language.rb +493 -0
  157. data/lib/kotoshu/languages/ja/language.rb +477 -0
  158. data/lib/kotoshu/languages/pt/language.rb +423 -0
  159. data/lib/kotoshu/languages/ru/language.rb +404 -0
  160. data/lib/kotoshu/languages.rb +43 -0
  161. data/lib/kotoshu/metrics_collector.rb +222 -0
  162. data/lib/kotoshu/metrics_module.rb +110 -0
  163. data/lib/kotoshu/models/context.rb +119 -0
  164. data/lib/kotoshu/models/embedding_model.rb +182 -0
  165. data/lib/kotoshu/models/fasttext_model.rb +220 -0
  166. data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
  167. data/lib/kotoshu/models/onnx_model.rb +333 -0
  168. data/lib/kotoshu/models/semantic_error.rb +165 -0
  169. data/lib/kotoshu/models/suggestion.rb +106 -0
  170. data/lib/kotoshu/models/word_embedding.rb +107 -0
  171. data/lib/kotoshu/paths.rb +53 -0
  172. data/lib/kotoshu/personal_dictionary.rb +94 -0
  173. data/lib/kotoshu/plugins/plugin.rb +61 -0
  174. data/lib/kotoshu/plugins/registry.rb +120 -0
  175. data/lib/kotoshu/project_config.rb +76 -0
  176. data/lib/kotoshu/readers/aff_data.rb +356 -0
  177. data/lib/kotoshu/readers/aff_reader.rb +375 -0
  178. data/lib/kotoshu/readers/condition_checker.rb +142 -0
  179. data/lib/kotoshu/readers/dic_reader.rb +118 -0
  180. data/lib/kotoshu/readers/file_reader.rb +347 -0
  181. data/lib/kotoshu/readers/lookup_builder.rb +299 -0
  182. data/lib/kotoshu/readers/readers.rb +6 -0
  183. data/lib/kotoshu/readers.rb +9 -0
  184. data/lib/kotoshu/resource_bundle.rb +30 -0
  185. data/lib/kotoshu/resource_manager.rb +295 -0
  186. data/lib/kotoshu/results/result.rb +165 -0
  187. data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
  188. data/lib/kotoshu/source_registry.rb +74 -0
  189. data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
  190. data/lib/kotoshu/spellchecker.rb +298 -0
  191. data/lib/kotoshu/string_metrics.rb +153 -0
  192. data/lib/kotoshu/suggestions/context.rb +55 -0
  193. data/lib/kotoshu/suggestions/generator.rb +175 -0
  194. data/lib/kotoshu/suggestions/pipeline.rb +135 -0
  195. data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
  196. data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
  197. data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
  198. data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
  199. data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
  200. data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
  201. data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
  202. data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
  203. data/lib/kotoshu/suggestions/suggestion.rb +174 -0
  204. data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
  205. data/lib/kotoshu/version.rb +5 -0
  206. data/lib/kotoshu.rb +493 -0
  207. data/script/validate_all_dictionaries.rb +444 -0
  208. data/sig/kotoshu.rbs +4 -0
  209. data/test_oop.rb +79 -0
  210. metadata +298 -0
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Language
    module Tokenizer
      # Tokenizer for Portuguese text.
      #
      # Ported from LanguageTool's PortugueseWordTokenizer.
      #
      # Before the text is split, characters that must stay inside a token
      # are swapped for placeholder characters; the placeholders are swapped
      # back once each raw token has been cut out. This covers:
      # - Decimal comma between digits (3,14)
      # - Dotted numbers (1.000.000)
      # - Dates (01.01.2024, 2024-01-01)
      # - Colons in time (12:25)
      # - Spaced decimals (2 000 000)
      #
      # Hyphenated words are split on the hyphen unless they are listed in
      # DO_NOT_SPLIT.
      class PortugueseTokenizer < Base
        # Portuguese word separators - most punctuation and whitespace
        # Note: We protect special patterns before splitting
        WORD_SEPARATORS = /[\s"()\[\]{}<>@€£\\$%‰‱ºªᵃᵒˢ|`~#^·]/.freeze

        # Placeholder characters (one character each, so they can be used
        # inside a regexp character class and with String#tr)
        DECIMAL_COMMA_SUBST = "\uE001"
        NON_BREAKING_SPACE_SUBST = "\uE002"
        NON_BREAKING_DOT_SUBST = "\uE003"
        NON_BREAKING_COLON_SUBST = "\uE004"

        # Decimal comma between digits: 3,14
        DECIMAL_COMMA_PATTERN = /(\d),(\d)/

        # Dotted numbers: 1.000.000
        DOTTED_NUMBERS_PATTERN = /(\d)\.(\d)/

        # Colon in numbers (time): 12:25
        COLON_NUMBERS_PATTERN = /(\d):(\d)/

        # Date patterns: 01.01.2024, 2024-01-01
        DATE_PATTERN = /(\d{2})\.(\d{2})\.(\d{4})|(\d{4})\.(\d{2})\.(\d{2})|(\d{4})-(\d{2})-(\d{2})/

        # Spaced decimals: 2 000 000
        # The optional fraction is matched through the placeholders because
        # commas and dots between digits have already been substituted by the
        # time this pattern is applied.
        SPACED_DECIMAL_PATTERN = /(?<=^|[\s(])\d{1,3}( \d{3})+(?:[,#{DECIMAL_COMMA_SUBST}#{NON_BREAKING_DOT_SUBST}]\d+)?(?=\D|$)/

        # Do-not-split list (from LanguageTool)
        DO_NOT_SPLIT = %w[
          mers-cov mcgraw-hill sars-cov-2 sars-cov
          ph-metre ph-metres anti-ivg anti-uv anti-vih al-qaïda
        ].freeze

        # Split Portuguese text into tokens.
        #
        # @param text [String, nil] Text to tokenize
        # @return [Array] Tokens after `normalize`, minus those rejected by
        #   `skip_token?` (both helpers are defined outside this class);
        #   an empty array for nil or blank input
        def tokenize(text)
          return [] if text.nil? || text.strip.empty?

          # Handle decimal commas
          if text.include?(",")
            text = text.gsub(DECIMAL_COMMA_PATTERN, "\\1#{DECIMAL_COMMA_SUBST}\\2")
          end

          # Handle dots in numbers and dates
          if text.include?(".")
            # Dates first (before dotted numbers to avoid conflicts).
            # String#gsub yields the matched String to its block, not a
            # MatchData, so indexing the block argument returns single
            # characters rather than capture groups. Protecting every dot
            # inside the matched date is all that is needed; the
            # YYYY-MM-DD alternative has no dots and passes through as-is.
            text = text.gsub(DATE_PATTERN) { |date| date.tr(".", NON_BREAKING_DOT_SUBST) }
            text = text.gsub(DOTTED_NUMBERS_PATTERN, "\\1#{NON_BREAKING_DOT_SUBST}\\2")
          end

          # Handle spaced decimals: 2 000 000
          text = handle_spaced_decimals(text)

          # Handle colons in time: 12:25
          if text.include?(":")
            text = text.gsub(COLON_NUMBERS_PATTERN, "\\1#{NON_BREAKING_COLON_SUBST}\\2")
          end

          # Split on separators, restore the protected characters in each
          # raw token, then apply hyphen handling.
          tokens = []
          text.split(WORD_SEPARATORS).each do |token|
            next if token.empty?

            tokens.concat(words_to_add(restore_placeholders(token)))
          end

          # Filter and normalize
          tokens
            .map { |token| normalize(token) }
            .reject { |token| skip_token?(token) }
        end

        protected

        # Restore placeholders to original characters.
        #
        # @param token [String] Token with placeholders
        # @return [String] Token with restored characters
        def restore_placeholders(token)
          token
            .gsub(DECIMAL_COMMA_SUBST, ",")
            .gsub(NON_BREAKING_COLON_SUBST, ":")
            .gsub(NON_BREAKING_SPACE_SUBST, " ")
            .gsub(NON_BREAKING_DOT_SUBST, ".")
        end

        # Split a word into tokens, handling hyphens.
        #
        # Words without a hyphen, and words on the do-not-split list
        # (compared case-insensitively), are returned whole. Otherwise the
        # word is split on every hyphen; an empty segment (leading, trailing
        # or doubled hyphen) is emitted as a "-" token.
        #
        # @param word [String] Word to split
        # @return [Array<String>] Array of tokens
        def words_to_add(word)
          return [word] unless word.include?("-")

          # Check do-not-split list
          return [word] if DO_NOT_SPLIT.include?(word.downcase)

          # For now, split on hyphens if not in do-not-split list
          # Future: integrate with tagger for better handling
          word.split("-", -1).map { |part| part.empty? ? "-" : part }
        end

        # @return [Regexp] Separator pattern used by this tokenizer
        def word_separators
          WORD_SEPARATORS
        end

        private

        # Handle spaced decimals: 2 000 000.
        #
        # Every span matched by SPACED_DECIMAL_PATTERN has its grouping
        # spaces replaced with NON_BREAKING_SPACE_SUBST so the number
        # survives the whitespace split. The replacement is done inside
        # gsub's block so exactly the matched span is rewritten, rather
        # than the first place the same digits happen to occur in the text.
        #
        # @param text [String] Input text
        # @return [String] Text with protected spaces inside spaced numbers
        def handle_spaced_decimals(text)
          text.gsub(SPACED_DECIMAL_PATTERN) do |number|
            number.tr(" \u00A0", NON_BREAKING_SPACE_SUBST)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Language
    module Tokenizer
      # Tokenizer for Russian text.
      #
      # Ported from LanguageTool's RussianWordTokenizer.
      #
      # Compared with the separator sets of the other tokenizers in this
      # directory, apostrophe and dot are NOT separators here, so both stay
      # inside tokens (e.g. dotted abbreviations). The slash is a separator,
      # which is why the abbreviations б/у and б/н are shielded with
      # placeholders before the text is split.
      class RussianTokenizer < Base
        # Russian-specific word separators (exclude apostrophe and dot)
        WORD_SEPARATORS = /[\s"()\[\]{}<>,;:!?\\\/|`~@#$%^&*+\-·]/.freeze

        # Abbreviation => placeholder. The placeholders contain no separator
        # character, so they pass through the split untouched.
        ABBREVIATION_PLACEHOLDERS = {
          "б/у" => "\u0001\u0001SOCR_BU\u0001\u0001",
          "б/н" => "\u0001\u0001SOCR_BN\u0001\u0001"
        }.freeze

        # Placeholder => original text, applied to every raw token.
        PLACEHOLDER_RESTORE = {
          "\u0001\u0001SOCR_BU\u0001\u0001" => "б/у",
          "\u0001\u0001SOCR_BN\u0001\u0001" => "б/н",
          "\u0001\u0001SP_DDOT_SP\u0001\u0001" => " .. ",
          "\u0001\u0001SP_DOT_SP\u0001\u0001" => " . ",
          "\u0001\u0001SP_DOT\u0001\u0001" => "."
        }.freeze

        # Split Russian text into tokens.
        #
        # @param text [String, nil] Text to tokenize
        # @return [Array] Tokens after `normalize`, minus those rejected by
        #   `skip_token?` (both helpers are defined outside this class);
        #   an empty array for nil or blank input
        def tokenize(text)
          return [] if text.nil? || text.strip.empty?

          pieces = replace_abbreviations(text).split(WORD_SEPARATORS)
          restored = pieces.map { |piece| restore_abbreviations(piece) }
          normalized = restored.map { |piece| normalize(piece) }
          normalized.reject { |piece| skip_token?(piece) }
        end

        protected

        # @return [Regexp] Separator pattern used by this tokenizer
        def word_separators
          WORD_SEPARATORS
        end

        private

        # Swap the protected abbreviations for their placeholders and mark
        # dots that directly follow a space.
        #
        # The " .. " and " . " sequences are parked behind temporary markers
        # only so that the " ." substitution cannot touch them; they are put
        # back before returning. What remains changed in the result is the
        # abbreviations and any other space-dot pair.
        #
        # @param text [String] Input text
        # @return [String] Text with placeholders
        def replace_abbreviations(text)
          double_dot_marker = "\u0001\u0001SP_DDOT_SP\u0001\u0001"
          single_dot_marker = "\u0001\u0001SP_DOT_SP\u0001\u0001"

          shielded = ABBREVIATION_PLACEHOLDERS.reduce(text) do |acc, (abbr, placeholder)|
            acc.gsub(abbr, placeholder)
          end

          shielded
            .gsub(" .. ", double_dot_marker)
            .gsub(" . ", single_dot_marker)
            .gsub(" .", " \u0001\u0001SP_DOT\u0001\u0001")
            .gsub(double_dot_marker, " .. ")
            .gsub(single_dot_marker, " . ")
        end

        # Turn every placeholder listed in PLACEHOLDER_RESTORE back into the
        # text it stands for.
        #
        # @param text [String] Text with placeholders
        # @return [String] Text with restored abbreviations
        def restore_abbreviations(text)
          PLACEHOLDER_RESTORE.reduce(text) do |acc, (placeholder, original)|
            acc.gsub(placeholder, original)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
module Kotoshu
  module Language
    module Tokenizer
      # Tokenizer for Spanish text.
      #
      # Ported from LanguageTool's SpanishWordTokenizer.
      #
      # Dot and comma are separators in this tokenizer, so the following are
      # shielded with placeholder characters before splitting and restored
      # afterwards:
      # - Decimal point between digits (3.14)
      # - Decimal comma between digits (3,14)
      # - Ordinals (1.º, 2.ª, 1.er, 1.os, 1.as)
      #
      # U+2010 and U+2011 hyphens are folded to "-" first. Hyphenated words
      # are split on the hyphen unless listed in DO_NOT_SPLIT (there is no
      # tagger to consult). Inverted punctuation (¡, ¿) is not part of
      # WORD_SEPARATORS, so it is not split off here.
      class SpanishTokenizer < Base
        # Spanish word separators - most punctuation and whitespace
        # Note: We need to handle decimals specially, so we protect them first
        WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze

        # Decimal point between digits: 3.14
        DECIMAL_POINT = /(\d)\.(\d)/

        # Decimal comma between digits: 3,14
        DECIMAL_COMMA = /(\d),(\d)/

        # Ordinal patterns: 1.º, 2.ª, 1.er, 1.os, 1.as
        ORDINAL = /\b(\d+)\.(º|ª|o|a|er|os|as)\b/

        # Placeholders for special patterns
        DECIMAL_POINT_PLACEHOLDER = "\uE101"
        DECIMAL_COMMA_PLACEHOLDER = "\uE102"
        ORDINAL_PLACEHOLDER = "\uE103"

        # Soft hyphen
        SOFT_HYPHEN = "\u00AD"

        # Do-not-split list (from LanguageTool)
        DO_NOT_SPLIT = %w[
          mers-cov mcgraw-hill sars-cov-2 sars-cov
          ph-metre ph-metres
        ].freeze

        # Split Spanish text into tokens.
        #
        # @param text [String, nil] Text to tokenize
        # @return [Array] Tokens after `normalize`, minus those rejected by
        #   `skip_token?` (both helpers are defined outside this class);
        #   an empty array for nil or blank input
        def tokenize(text)
          return [] if text.nil? || text.strip.empty?

          # Fold hyphen (U+2010) and non-breaking hyphen (U+2011) to hyphen-minus,
          # then shield decimals and ordinals, in that order.
          shielded = text
                     .tr("\u2010\u2011", "\u002d")
                     .gsub(DECIMAL_POINT, "\\1#{DECIMAL_POINT_PLACEHOLDER}\\2")
                     .gsub(DECIMAL_COMMA, "\\1#{DECIMAL_COMMA_PLACEHOLDER}\\2")
                     .gsub(ORDINAL, "\\1#{ORDINAL_PLACEHOLDER}\\2")

          # Cut into raw tokens, restore the shielded characters in each one
          # and apply hyphen handling.
          pieces = shielded.split(WORD_SEPARATORS).reject(&:empty?)
          expanded = pieces.flat_map { |piece| words_to_add(restore_placeholders(piece)) }

          # Filter and normalize
          normalized = expanded.map { |piece| normalize(piece) }
          normalized.reject { |piece| skip_token?(piece) }
        end

        protected

        # Put the original characters back in place of the placeholders.
        # Both the decimal-point and the ordinal placeholder become ".".
        #
        # @param token [String] Token with placeholders
        # @return [String] Token with restored characters
        def restore_placeholders(token)
          with_point = token.gsub(DECIMAL_POINT_PLACEHOLDER, ".")
          with_comma = with_point.gsub(DECIMAL_COMMA_PLACEHOLDER, ",")
          with_comma.gsub(ORDINAL_PLACEHOLDER, ".")
        end

        # Split a word into tokens, handling hyphens.
        #
        # Words without a hyphen, and words on the do-not-split list
        # (compared case-insensitively), are returned whole and untouched.
        # Any other word has its soft hyphens removed and is split on every
        # hyphen; an empty segment is emitted as a "-" token.
        #
        # @param word [String] Word to split
        # @return [Array<String>] Array of tokens
        def words_to_add(word)
          if !word.include?("-") || DO_NOT_SPLIT.include?(word.downcase)
            return [word]
          end

          # NOTE(review): the second substitution maps an apostrophe to itself
          # as written; presumably it was meant to normalize a typographic
          # apostrophe - confirm against the LanguageTool original.
          cleaned = word.gsub(SOFT_HYPHEN, "").gsub("'", "'")

          # For now, split on hyphens if not in do-not-split list
          # Future: integrate with tagger for better handling
          cleaned.split("-", -1).map { |segment| segment.empty? ? "-" : segment }
        end

        # @return [Regexp] Separator pattern used by this tokenizer
        def word_separators
          WORD_SEPARATORS
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "language/registry"
4
+ require_relative "language/detector"
5
+ require_relative "language/tokenizer/base"
6
+ require_relative "language/tokenizer/latin_tokenizer"
7
+ require_relative "language/tokenizer/french_tokenizer"
8
+ require_relative "language/tokenizer/german_tokenizer"
9
+ require_relative "language/tokenizer/spanish_tokenizer"
10
+ require_relative "language/tokenizer/portuguese_tokenizer"
11
+ require_relative "language/tokenizer/russian_tokenizer"
12
+ require_relative "language/tokenizer/japanese_tokenizer"
13
+ require_relative "language/normalizer/base"
14
+ require_relative "language/languages/base"
15
+
16
+ # Load all language-specific modules from new structure (languages/{en,fr,de,ja,pt,ru,es}/)
17
+ require_relative "languages"
18
+
19
module Kotoshu
  # Language module for multi-language support.
  #
  # A thin facade: every method below forwards its arguments unchanged to
  # either Detector (language detection) or Registry (language lookup and
  # registration) and returns whatever that call returns.
  #
  # @example Detect language
  #   Kotoshu::Language.detect("Hello world") # => "en"
  #
  # @example Get language class
  #   lang_class = Kotoshu::Language.get("en-US")
  #
  # @example List supported languages
  #   Kotoshu::Language.supported_codes # => ["de-DE", "en-US", ...]
  module Language
    # Runs when this file is loaded: hands Detector to the registry as the
    # default detector.
    Registry.register_detector(Detector)

    # Detect language from text (forwards to Detector.detect).
    #
    # @param text [String] Text to analyze
    # @return [String, nil] Detected language code
    def self.detect(text)
      Detector.detect(text)
    end

    # Detect language together with a confidence score
    # (forwards to Detector.detect_with_confidence).
    #
    # @param text [String] Text to analyze
    # @return [Array<String, Float>] Language code and confidence
    def self.detect_with_confidence(text)
      Detector.detect_with_confidence(text)
    end

    # Look up a language class by code (forwards to Registry.get).
    #
    # @param code [String] Language code
    # @return [Class, nil] Language class or nil
    def self.get(code)
      Registry.get(code)
    end

    # Tell whether a language code is registered
    # (forwards to Registry.registered?).
    #
    # @param code [String] Language code
    # @return [Boolean] True if registered
    def self.registered?(code)
      Registry.registered?(code)
    end

    # List every supported language code
    # (forwards to Registry.supported_codes).
    #
    # @return [Array<String>] List of codes
    def self.supported_codes
      Registry.supported_codes
    end

    # Fetch the info record for a language (forwards to Registry.info).
    #
    # @param code [String] Language code
    # @return [Hash, nil] Language info or nil
    def self.info(code)
      Registry.info(code)
    end

    # Register a language class under a code
    # (forwards to Registry.register).
    #
    # @param code [String] Language code
    # @param klass [Class] Language class
    # @return [void]
    def self.register(code, klass)
      Registry.register(code, klass)
    end
  end
end