kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for Portuguese text.
|
|
7
|
+
#
|
|
8
|
+
# Ported from LanguageTool's PortugueseWordTokenizer.
|
|
9
|
+
#
|
|
10
|
+
# Handles:
|
|
11
|
+
# - Decimal comma between digits (3,14)
|
|
12
|
+
# - Dotted numbers (1.000.000)
|
|
13
|
+
# - Dates (01.01.2024, 2024-01-01)
|
|
14
|
+
# - Colons in time (12:25)
|
|
15
|
+
# - Hyphens with do-not-split list
|
|
16
|
+
# - Spaced decimals (2 000 000)
|
|
17
|
+
class PortugueseTokenizer < Base
|
|
18
|
+
# Portuguese word separators - most punctuation and whitespace
|
|
19
|
+
# Note: We protect special patterns before splitting
|
|
20
|
+
WORD_SEPARATORS = /[\s"()\[\]{}<>@€£\\$%‰‱ºªᵃᵒˢ|`~#^·]/.freeze
|
|
21
|
+
|
|
22
|
+
# Placeholder characters (using non-printing characters)
|
|
23
|
+
DECIMAL_COMMA_SUBST = "\uE001"
|
|
24
|
+
NON_BREAKING_SPACE_SUBST = "\uE002"
|
|
25
|
+
NON_BREAKING_DOT_SUBST = "\uE003"
|
|
26
|
+
NON_BREAKING_COLON_SUBST = "\uE004"
|
|
27
|
+
|
|
28
|
+
# Decimal comma between digits: 3,14
|
|
29
|
+
DECIMAL_COMMA_PATTERN = /(\d),(\d)/
|
|
30
|
+
|
|
31
|
+
# Dotted numbers: 1.000.000
|
|
32
|
+
DOTTED_NUMBERS_PATTERN = /(\d)\.(\d)/
|
|
33
|
+
|
|
34
|
+
# Colon in numbers (time): 12:25
|
|
35
|
+
COLON_NUMBERS_PATTERN = /(\d):(\d)/
|
|
36
|
+
|
|
37
|
+
# Date patterns: 01.01.2024, 2024-01-01
|
|
38
|
+
DATE_PATTERN = /(\d{2})\.(\d{2})\.(\d{4})|(\d{4})\.(\d{2})\.(\d{2})|(\d{4})-(\d{2})-(\d{2})/
|
|
39
|
+
|
|
40
|
+
# Spaced decimals: 2 000 000
|
|
41
|
+
SPACED_DECIMAL_PATTERN = /(?<=^|[\s(])\d{1,3}( \d{3})+(?:[,#{DECIMAL_COMMA_SUBST}#{NON_BREAKING_DOT_SUBST}]\d+)?(?=\D|$)/
|
|
42
|
+
|
|
43
|
+
# Do-not-split list (from LanguageTool)
|
|
44
|
+
DO_NOT_SPLIT = %w[
|
|
45
|
+
mers-cov mcgraw-hill sars-cov-2 sars-cov
|
|
46
|
+
ph-metre ph-metres anti-ivg anti-uv anti-vih al-qaïda
|
|
47
|
+
].freeze
|
|
48
|
+
|
|
49
|
+
# Split Portuguese text into word tokens.
#
# Separators that belong to a number, date or time are first swapped for
# private-use placeholder characters, the text is split on
# WORD_SEPARATORS, and the placeholders are restored inside each token
# before hyphen handling.
#
# @param text [String, nil] Text to tokenize
# @return [Array<String>] Tokens after +normalize+ and +skip_token?+
#   (both inherited; presumably defined in Base, not visible here) have
#   been applied. Empty when +text+ is nil or blank.
def tokenize(text)
  return [] if text.nil? || text.strip.empty?

  # Decimal commas: 3,14
  if text.include?(",")
    text = text.gsub(DECIMAL_COMMA_PATTERN, "\\1#{DECIMAL_COMMA_SUBST}\\2")
  end

  if text.include?(".")
    # Dates go first. String#gsub hands the block the matched String (not a
    # MatchData), so the block must not index it as capture groups: doing
    # so picks single characters and corrupts the date. Dotted dates only
    # need their dots protected; ISO dates (2024-01-01) contain no dot and
    # therefore pass through unchanged.
    text = text.gsub(DATE_PATTERN) { |date| date.gsub(".", NON_BREAKING_DOT_SUBST) }

    # Remaining dots between digits: 1.000.000
    text = text.gsub(DOTTED_NUMBERS_PATTERN, "\\1#{NON_BREAKING_DOT_SUBST}\\2")
  end

  # Spaced thousands groups: 2 000 000
  text = handle_spaced_decimals(text)

  # Colons between digits (time): 12:25
  if text.include?(":")
    text = text.gsub(COLON_NUMBERS_PATTERN, "\\1#{NON_BREAKING_COLON_SUBST}\\2")
  end

  # Split, drop empty fragments, restore placeholders, then split hyphens.
  tokens = text.split(WORD_SEPARATORS).reject(&:empty?).flat_map do |raw|
    words_to_add(restore_placeholders(raw))
  end

  tokens
    .map { |token| normalize(token) }
    .reject { |token| skip_token?(token) }
end
|
|
104
|
+
|
|
105
|
+
protected
|
|
106
|
+
|
|
107
|
+
# Turn every placeholder character in a token back into the character it
# stood in for (comma, colon, space, dot).
#
# @param token [String] Token that may contain placeholder characters
# @return [String] Copy of the token with the original characters restored
def restore_placeholders(token)
  originals = {
    DECIMAL_COMMA_SUBST => ",",
    NON_BREAKING_COLON_SUBST => ":",
    NON_BREAKING_SPACE_SUBST => " ",
    NON_BREAKING_DOT_SUBST => "."
  }

  originals.reduce(token) do |restored, (placeholder, character)|
    restored.gsub(placeholder, character)
  end
end
|
|
118
|
+
|
|
119
|
+
# Break a token on hyphens unless it is hyphen-free or listed in
# DO_NOT_SPLIT (compared case-insensitively).
#
# An empty fragment (leading, trailing or doubled hyphen) is emitted as a
# literal "-" token.
#
# @param word [String] Token to examine
# @return [Array<String>] The token itself, or its hyphen-separated parts
def words_to_add(word)
  keep_whole = !word.include?("-") || DO_NOT_SPLIT.include?(word.downcase)
  return [word] if keep_whole

  # No tagger is available yet, so every remaining hyphen is a split point.
  word.split("-", -1).map { |piece| piece.empty? ? "-" : piece }
end
|
|
135
|
+
|
|
136
|
+
# Separator pattern used to split Portuguese text.
#
# @return [Regexp] The WORD_SEPARATORS constant
def word_separators
  WORD_SEPARATORS
end
|
|
139
|
+
|
|
140
|
+
private
|
|
141
|
+
|
|
142
|
+
# Protect the spaces inside space-grouped numbers (2 000 000) so the number
# survives the later split on whitespace.
#
# Each match of SPACED_DECIMAL_PATTERN is rewritten in place, at the exact
# position where it matched. Replacing by substring lookup instead could
# hit an earlier, non-matching occurrence of the same characters (e.g. the
# "1 000" inside "a1 000") and leave the real number unprotected.
#
# @param text [String] Input text
# @return [String] Copy of the text in which every space inside a matched
#   number is replaced by NON_BREAKING_SPACE_SUBST
def handle_spaced_decimals(text)
  text.gsub(SPACED_DECIMAL_PATTERN) do |number|
    number.gsub(" ", NON_BREAKING_SPACE_SUBST)
  end
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for Russian text.
|
|
7
|
+
#
|
|
8
|
+
# Ported from LanguageTool's RussianWordTokenizer.
|
|
9
|
+
#
|
|
10
|
+
# Handles:
|
|
11
|
+
# - Apostrophe as word character
|
|
12
|
+
# - Dot as word character (for abbreviations)
|
|
13
|
+
# - Special abbreviations: б/у (second-hand), б/н (new)
|
|
14
|
+
# - Spaced dots: .. , .
|
|
15
|
+
class RussianTokenizer < Base
|
|
16
|
+
# Russian-specific word separators (exclude apostrophe and dot)
|
|
17
|
+
WORD_SEPARATORS = /[\s"()\[\]{}<>,;:!?\\\/|`~@#$%^&*+\-·]/.freeze
|
|
18
|
+
|
|
19
|
+
# Special abbreviations that should not be split
|
|
20
|
+
# Using non-printing characters as placeholders
|
|
21
|
+
ABBREVIATION_PLACEHOLDERS = {
|
|
22
|
+
"б/у" => "\u0001\u0001SOCR_BU\u0001\u0001",
|
|
23
|
+
"б/н" => "\u0001\u0001SOCR_BN\u0001\u0001"
|
|
24
|
+
}.freeze
|
|
25
|
+
|
|
26
|
+
# Reverse placeholders for restoration
|
|
27
|
+
PLACEHOLDER_RESTORE = {
|
|
28
|
+
"\u0001\u0001SOCR_BU\u0001\u0001" => "б/у",
|
|
29
|
+
"\u0001\u0001SOCR_BN\u0001\u0001" => "б/н",
|
|
30
|
+
"\u0001\u0001SP_DDOT_SP\u0001\u0001" => " .. ",
|
|
31
|
+
"\u0001\u0001SP_DOT_SP\u0001\u0001" => " . ",
|
|
32
|
+
"\u0001\u0001SP_DOT\u0001\u0001" => "."
|
|
33
|
+
}.freeze
|
|
34
|
+
|
|
35
|
+
# Split Russian text into word tokens.
#
# Abbreviations that contain a separator character are hidden behind
# placeholders before splitting and put back afterwards.
#
# @param text [String, nil] Text to tokenize
# @return [Array<String>] Tokens after +normalize+ and +skip_token?+
#   (inherited; presumably defined in Base, not visible here). Empty when
#   +text+ is nil or blank.
def tokenize(text)
  return [] if text.nil? || text.strip.empty?

  pieces = replace_abbreviations(text).split(WORD_SEPARATORS)
  restored = pieces.map { |piece| restore_abbreviations(piece) }
  normalized = restored.map { |piece| normalize(piece) }
  normalized.reject { |piece| skip_token?(piece) }
end
|
|
50
|
+
|
|
51
|
+
protected
|
|
52
|
+
|
|
53
|
+
# Separator pattern used to split Russian text.
#
# @return [Regexp] The WORD_SEPARATORS constant
def word_separators
  WORD_SEPARATORS
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
# Hide separator-containing abbreviations and a dot that follows a space
# behind placeholders so that splitting leaves them intact.
#
# " .. " and " . " are shielded temporarily so that the " ." rule does not
# touch them, and are put back before returning; only the abbreviation and
# single-dot placeholders remain in the result.
#
# @param text [String] Input text
# @return [String] Text with placeholders inserted
def replace_abbreviations(text)
  shielded = ABBREVIATION_PLACEHOLDERS.reduce(text) do |current, (abbreviation, placeholder)|
    current.gsub(abbreviation, placeholder)
  end

  double_dot_mark = "\u0001\u0001SP_DDOT_SP\u0001\u0001"
  single_dot_mark = "\u0001\u0001SP_DOT_SP\u0001\u0001"

  shielded
    .gsub(" .. ", double_dot_mark)
    .gsub(" . ", single_dot_mark)
    .gsub(" .", " \u0001\u0001SP_DOT\u0001\u0001")
    .gsub(double_dot_mark, " .. ")
    .gsub(single_dot_mark, " . ")
end
|
|
80
|
+
|
|
81
|
+
# Swap every placeholder listed in PLACEHOLDER_RESTORE back to its original
# text, applying the entries in hash order.
#
# @param text [String] Text (typically a single token) with placeholders
# @return [String] Text with the original abbreviations and dots restored
def restore_abbreviations(text)
  PLACEHOLDER_RESTORE.reduce(text) do |current, (placeholder, original)|
    current.gsub(placeholder, original)
  end
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Language
|
|
5
|
+
module Tokenizer
|
|
6
|
+
# Tokenizer for Spanish text.
|
|
7
|
+
#
|
|
8
|
+
# Ported from LanguageTool's SpanishWordTokenizer.
|
|
9
|
+
#
|
|
10
|
+
# Handles:
|
|
11
|
+
# - Decimal point between digits (3.14)
|
|
12
|
+
# - Decimal comma between digits (3,14)
|
|
13
|
+
# - Ordinals (1.º, 2.ª, 1.er, 1.os, 1.as)
|
|
14
|
+
# - Hyphens (with do-not-split list since no tagger)
|
|
15
|
+
# - Soft hyphens
|
|
16
|
+
# - Inverted punctuation (¡, ¿)
|
|
17
|
+
class SpanishTokenizer < Base
|
|
18
|
+
# Spanish word separators - most punctuation and whitespace
|
|
19
|
+
# Note: We need to handle decimals specially, so we protect them first
|
|
20
|
+
WORD_SEPARATORS = /[\s"()\[\]{}<>,.;:!?\\\/|`~@#$%^&*·]/.freeze
|
|
21
|
+
|
|
22
|
+
# Decimal point between digits: 3.14
|
|
23
|
+
DECIMAL_POINT = /(\d)\.(\d)/
|
|
24
|
+
|
|
25
|
+
# Decimal comma between digits: 3,14
|
|
26
|
+
DECIMAL_COMMA = /(\d),(\d)/
|
|
27
|
+
|
|
28
|
+
# Ordinal patterns: 1.º, 2.ª, 1.er, 1.os, 1.as
|
|
29
|
+
ORDINAL = /\b(\d+)\.(º|ª|o|a|er|os|as)\b/
|
|
30
|
+
|
|
31
|
+
# Placeholders for special patterns
|
|
32
|
+
DECIMAL_POINT_PLACEHOLDER = "\uE101"
|
|
33
|
+
DECIMAL_COMMA_PLACEHOLDER = "\uE102"
|
|
34
|
+
ORDINAL_PLACEHOLDER = "\uE103"
|
|
35
|
+
|
|
36
|
+
# Soft hyphen
|
|
37
|
+
SOFT_HYPHEN = "\u00AD"
|
|
38
|
+
|
|
39
|
+
# Do-not-split list (from LanguageTool)
|
|
40
|
+
DO_NOT_SPLIT = %w[
|
|
41
|
+
mers-cov mcgraw-hill sars-cov-2 sars-cov
|
|
42
|
+
ph-metre ph-metres
|
|
43
|
+
].freeze
|
|
44
|
+
|
|
45
|
+
# Split Spanish text into word tokens.
#
# Dots and commas that belong to numbers or ordinals are swapped for
# private-use placeholders, the text is split on WORD_SEPARATORS, and the
# placeholders are restored inside each token before hyphen handling.
#
# @param text [String, nil] Text to tokenize
# @return [Array<String>] Tokens after +normalize+ and +skip_token?+
#   (inherited; presumably defined in Base, not visible here). Empty when
#   +text+ is nil or blank.
def tokenize(text)
  return [] if text.nil? || text.strip.empty?

  # Fold HYPHEN (U+2010) and NON-BREAKING HYPHEN (U+2011) into hyphen-minus.
  text = text.gsub("\u2010", "\u002d").gsub("\u2011", "\u002d")

  # Protect dots between digits. DECIMAL_POINT consumes the digit on each
  # side, so in "1.2.3" a single pass skips every second dot (its left-hand
  # digit was already consumed by the previous match). A skipped dot always
  # sits between two protected ones, so a second pass catches all of them.
  2.times do
    text = text.gsub(DECIMAL_POINT, "\\1#{DECIMAL_POINT_PLACEHOLDER}\\2")
  end

  # Protect decimal commas: 3,14
  text = text.gsub(DECIMAL_COMMA, "\\1#{DECIMAL_COMMA_PLACEHOLDER}\\2")

  # Protect the dot of ordinals: 1.º, 2.ª, 1.er
  text = text.gsub(ORDINAL, "\\1#{ORDINAL_PLACEHOLDER}\\2")

  # Split, drop empty fragments, restore placeholders, then split hyphens.
  tokens = text.split(WORD_SEPARATORS).reject(&:empty?).flat_map do |raw|
    words_to_add(restore_placeholders(raw))
  end

  tokens
    .map { |token| normalize(token) }
    .reject { |token| skip_token?(token) }
end
|
|
82
|
+
|
|
83
|
+
protected
|
|
84
|
+
|
|
85
|
+
# Turn the decimal-point, decimal-comma and ordinal placeholders in a token
# back into "." and ",".
#
# @param token [String] Token that may contain placeholder characters
# @return [String] Copy of the token with the original characters restored
def restore_placeholders(token)
  originals = {
    DECIMAL_POINT_PLACEHOLDER => ".",
    DECIMAL_COMMA_PLACEHOLDER => ",",
    ORDINAL_PLACEHOLDER => "."
  }

  originals.reduce(token) do |restored, (placeholder, character)|
    restored.gsub(placeholder, character)
  end
end
|
|
95
|
+
|
|
96
|
+
# Break a token on hyphens unless it is hyphen-free or listed in
# DO_NOT_SPLIT.
#
# Before the check the token is normalised: soft hyphens (U+00AD) are
# removed and the typographic apostrophe (U+2019) is mapped to "'", as in
# the LanguageTool tokenizer this class is ported from. The previous
# apostrophe substitution replaced "'" with itself and so did nothing.
# An empty fragment (leading, trailing or doubled hyphen) is emitted as a
# literal "-" token.
#
# @param word [String] Token to examine
# @return [Array<String>] The original token when it is kept whole,
#   otherwise the hyphen-separated parts of the normalised token
def words_to_add(word)
  return [word] unless word.include?("-")

  normalized = word.gsub(SOFT_HYPHEN, "").gsub("\u2019", "'")

  # Check the list against the normalised form so that an invisible soft
  # hyphen cannot defeat the lookup.
  return [word] if DO_NOT_SPLIT.include?(normalized.downcase)

  # No tagger is available yet, so every remaining hyphen is a split point.
  normalized.split("-", -1).map { |part| part.empty? ? "-" : part }
end
|
|
115
|
+
|
|
116
|
+
# Separator pattern used to split Spanish text.
#
# @return [Regexp] The WORD_SEPARATORS constant
def word_separators
  WORD_SEPARATORS
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "language/registry"
|
|
4
|
+
require_relative "language/detector"
|
|
5
|
+
require_relative "language/tokenizer/base"
|
|
6
|
+
require_relative "language/tokenizer/latin_tokenizer"
|
|
7
|
+
require_relative "language/tokenizer/french_tokenizer"
|
|
8
|
+
require_relative "language/tokenizer/german_tokenizer"
|
|
9
|
+
require_relative "language/tokenizer/spanish_tokenizer"
|
|
10
|
+
require_relative "language/tokenizer/portuguese_tokenizer"
|
|
11
|
+
require_relative "language/tokenizer/russian_tokenizer"
|
|
12
|
+
require_relative "language/tokenizer/japanese_tokenizer"
|
|
13
|
+
require_relative "language/normalizer/base"
|
|
14
|
+
require_relative "language/languages/base"
|
|
15
|
+
|
|
16
|
+
# Load all language-specific modules from new structure (languages/{en,fr,de,ja,pt,ru,es}/)
|
|
17
|
+
require_relative "languages"
|
|
18
|
+
|
|
19
|
+
module Kotoshu
  # Entry point for multi-language support.
  #
  # Every method here is a thin forwarder: detection goes to Detector,
  # lookup and registration go to Registry.
  #
  # @example Detect language
  #   Kotoshu::Language.detect("Hello world")  # => "en"
  #
  # @example Get language class
  #   lang_class = Kotoshu::Language.get("en-US")
  #
  # @example List supported languages
  #   Kotoshu::Language.supported_codes  # => ["de-DE", "en-US", ...]
  module Language
    # Install Detector as the registry's detector when this file is loaded.
    Registry.register_detector(Detector)

    # Detect the language of a text (forwards to Detector.detect).
    #
    # @param text [String] Text to analyze
    # @return [String, nil] Detected language code
    def self.detect(text)
      Detector.detect(text)
    end

    # Detect the language together with a confidence score
    # (forwards to Detector.detect_with_confidence).
    #
    # @param text [String] Text to analyze
    # @return [Array<String, Float>] Language code and confidence
    def self.detect_with_confidence(text)
      Detector.detect_with_confidence(text)
    end

    # Look up a language class by code (forwards to Registry.get).
    #
    # @param code [String] Language code
    # @return [Class, nil] Language class or nil
    def self.get(code)
      Registry.get(code)
    end

    # Tell whether a language code is registered
    # (forwards to Registry.registered?).
    #
    # @param code [String] Language code
    # @return [Boolean] True if registered
    def self.registered?(code)
      Registry.registered?(code)
    end

    # List every supported language code
    # (forwards to Registry.supported_codes).
    #
    # @return [Array<String>] List of codes
    def self.supported_codes
      Registry.supported_codes
    end

    # Fetch descriptive info for a language (forwards to Registry.info).
    #
    # @param code [String] Language code
    # @return [Hash, nil] Language info or nil
    def self.info(code)
      Registry.info(code)
    end

    # Register a language class under a code (forwards to Registry.register).
    #
    # @param code [String] Language code
    # @param klass [Class] Language class
    # @return [void]
    def self.register(code, klass)
      Registry.register(code, klass)
    end
  end
end
|