kotoshu 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +18 -0
- data/CHANGELOG.md +182 -0
- data/CLAUDE.md +172 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE +31 -0
- data/README.adoc +955 -0
- data/Rakefile +12 -0
- data/SECURITY.md +93 -0
- data/examples/01_basic_word_checking.rb +38 -0
- data/examples/02_text_document_checking.rb +77 -0
- data/examples/03_dictionary_backends.rb +137 -0
- data/examples/04_trie_data_structure.rb +146 -0
- data/examples/05_suggestion_algorithms.rb +239 -0
- data/examples/06_configuration_advanced.rb +287 -0
- data/examples/07_multi_language_dictionaries.rb +278 -0
- data/exe/kotoshu +6 -0
- data/lib/kotoshu/algorithms/capitalization.rb +276 -0
- data/lib/kotoshu/algorithms/lookup.rb +876 -0
- data/lib/kotoshu/algorithms/ngram_suggest.rb +270 -0
- data/lib/kotoshu/algorithms/permutations.rb +283 -0
- data/lib/kotoshu/algorithms/phonet_suggest.rb +167 -0
- data/lib/kotoshu/algorithms/suggest.rb +575 -0
- data/lib/kotoshu/algorithms.rb +14 -0
- data/lib/kotoshu/analyzers/semantic_analyzer.rb +295 -0
- data/lib/kotoshu/cache/base_cache.rb +596 -0
- data/lib/kotoshu/cache/cache.rb +91 -0
- data/lib/kotoshu/cache/frequency_cache.rb +224 -0
- data/lib/kotoshu/cache/language_cache.rb +454 -0
- data/lib/kotoshu/cache/lookup_cache.rb +166 -0
- data/lib/kotoshu/cache/model_cache.rb +513 -0
- data/lib/kotoshu/cache/suggestion_cache.rb +113 -0
- data/lib/kotoshu/cache.rb +40 -0
- data/lib/kotoshu/cli/auto_setup.rb +71 -0
- data/lib/kotoshu/cli/batch_reporter.rb +315 -0
- data/lib/kotoshu/cli/cache_command.rb +356 -0
- data/lib/kotoshu/cli/display_formatter.rb +431 -0
- data/lib/kotoshu/cli/errors.rb +36 -0
- data/lib/kotoshu/cli/interactive_reviewer.rb +319 -0
- data/lib/kotoshu/cli/language_resolver.rb +91 -0
- data/lib/kotoshu/cli/navigation_manager.rb +272 -0
- data/lib/kotoshu/cli/progress_reporter.rb +114 -0
- data/lib/kotoshu/cli/status_report.rb +130 -0
- data/lib/kotoshu/cli.rb +627 -0
- data/lib/kotoshu/commands/cache_command.rb +424 -0
- data/lib/kotoshu/commands/check_command.rb +312 -0
- data/lib/kotoshu/commands/model_command.rb +295 -0
- data/lib/kotoshu/components/passthrough_spell_checker.rb +72 -0
- data/lib/kotoshu/components/pos_tagger.rb +98 -0
- data/lib/kotoshu/components/spell_checker.rb +73 -0
- data/lib/kotoshu/components/synthesizer.rb +60 -0
- data/lib/kotoshu/components/tokenizer.rb +58 -0
- data/lib/kotoshu/components/whitespace_tokenizer.rb +96 -0
- data/lib/kotoshu/configuration/builder.rb +209 -0
- data/lib/kotoshu/configuration/resolver.rb +124 -0
- data/lib/kotoshu/configuration.rb +702 -0
- data/lib/kotoshu/core/exceptions.rb +165 -0
- data/lib/kotoshu/core/indexed_dictionary.rb +291 -0
- data/lib/kotoshu/core/models/affix_rule.rb +260 -0
- data/lib/kotoshu/core/models/result/document_result.rb +263 -0
- data/lib/kotoshu/core/models/result/word_result.rb +203 -0
- data/lib/kotoshu/core/models/word.rb +142 -0
- data/lib/kotoshu/core/trie/builder.rb +119 -0
- data/lib/kotoshu/core/trie/node.rb +94 -0
- data/lib/kotoshu/core/trie/trie.rb +249 -0
- data/lib/kotoshu/core.rb +28 -0
- data/lib/kotoshu/data/common_words/de.yml +1800 -0
- data/lib/kotoshu/data/common_words/en.yml +1215 -0
- data/lib/kotoshu/data/common_words/es.yml +750 -0
- data/lib/kotoshu/data/common_words/fr.yml +1015 -0
- data/lib/kotoshu/data/common_words/pt.yml +870 -0
- data/lib/kotoshu/data/common_words/ru.yml +484 -0
- data/lib/kotoshu/data/common_words_loader.rb +152 -0
- data/lib/kotoshu/data_structures/bloom_filter.rb +176 -0
- data/lib/kotoshu/debug_logger.rb +146 -0
- data/lib/kotoshu/debug_mode.rb +134 -0
- data/lib/kotoshu/defaults.rb +86 -0
- data/lib/kotoshu/dictionaries/catalog.rb +817 -0
- data/lib/kotoshu/dictionary/base.rb +237 -0
- data/lib/kotoshu/dictionary/cspell.rb +254 -0
- data/lib/kotoshu/dictionary/custom.rb +224 -0
- data/lib/kotoshu/dictionary/hunspell.rb +526 -0
- data/lib/kotoshu/dictionary/plain_text.rb +282 -0
- data/lib/kotoshu/dictionary/repository.rb +248 -0
- data/lib/kotoshu/dictionary/unified.rb +260 -0
- data/lib/kotoshu/dictionary/unix_words.rb +218 -0
- data/lib/kotoshu/documents/asciidoc_document.rb +441 -0
- data/lib/kotoshu/documents/document.rb +229 -0
- data/lib/kotoshu/documents/location.rb +139 -0
- data/lib/kotoshu/documents/markdown_document.rb +389 -0
- data/lib/kotoshu/documents/plain_text_document.rb +147 -0
- data/lib/kotoshu/embeddings/embedding_pipeline.rb +244 -0
- data/lib/kotoshu/embeddings/lru_cache.rb +233 -0
- data/lib/kotoshu/embeddings/onnx_runtime_model.rb +388 -0
- data/lib/kotoshu/embeddings/protocol.rb +83 -0
- data/lib/kotoshu/embeddings/protocols.rb +17 -0
- data/lib/kotoshu/embeddings/registry.rb +182 -0
- data/lib/kotoshu/embeddings/search.rb +192 -0
- data/lib/kotoshu/embeddings/similarity_engine.rb +248 -0
- data/lib/kotoshu/embeddings/similarity_search.rb +331 -0
- data/lib/kotoshu/embeddings/vocabulary.rb +257 -0
- data/lib/kotoshu/embeddings.rb +97 -0
- data/lib/kotoshu/fluent_checker.rb +91 -0
- data/lib/kotoshu/grammar/pattern_matchers/base_matcher.rb +48 -0
- data/lib/kotoshu/grammar/pattern_matchers/double_negative_matcher.rb +105 -0
- data/lib/kotoshu/grammar/pattern_matchers/possessive_context_matcher.rb +77 -0
- data/lib/kotoshu/grammar/pattern_matchers/vowel_sound_matcher.rb +83 -0
- data/lib/kotoshu/grammar/rule.rb +95 -0
- data/lib/kotoshu/grammar/rule_engine.rb +111 -0
- data/lib/kotoshu/grammar/rule_loader.rb +31 -0
- data/lib/kotoshu/grammar.rb +18 -0
- data/lib/kotoshu/integrity/audit_log.rb +88 -0
- data/lib/kotoshu/integrity/manifest.rb +117 -0
- data/lib/kotoshu/integrity/net_http.rb +46 -0
- data/lib/kotoshu/integrity.rb +25 -0
- data/lib/kotoshu/keyboard/layout.rb +115 -0
- data/lib/kotoshu/keyboard/layouts/azerty.rb +57 -0
- data/lib/kotoshu/keyboard/layouts/dvorak.rb +56 -0
- data/lib/kotoshu/keyboard/layouts/jcuken.rb +59 -0
- data/lib/kotoshu/keyboard/layouts/qwerty.rb +54 -0
- data/lib/kotoshu/keyboard/layouts/qwertz.rb +57 -0
- data/lib/kotoshu/keyboard/registry.rb +146 -0
- data/lib/kotoshu/keyboard.rb +60 -0
- data/lib/kotoshu/language/detector.rb +242 -0
- data/lib/kotoshu/language/identifier.rb +378 -0
- data/lib/kotoshu/language/languages/base.rb +256 -0
- data/lib/kotoshu/language/normalizer/base.rb +137 -0
- data/lib/kotoshu/language/registry.rb +147 -0
- data/lib/kotoshu/language/resources/ar/common_words.txt +6753 -0
- data/lib/kotoshu/language/resources/ar/confusion_sets.txt +11 -0
- data/lib/kotoshu/language/resources/de/common_words.txt +10003 -0
- data/lib/kotoshu/language/resources/de/confusion_sets.txt +246 -0
- data/lib/kotoshu/language/resources/en/common_words.txt +9979 -0
- data/lib/kotoshu/language/resources/en/confusion_sets.txt +871 -0
- data/lib/kotoshu/language/resources/es/common_words.txt +9992 -0
- data/lib/kotoshu/language/resources/es/confusion_sets.txt +17 -0
- data/lib/kotoshu/language/resources/fr/common_words.txt +9993 -0
- data/lib/kotoshu/language/resources/fr/confusion_sets.txt +76 -0
- data/lib/kotoshu/language/resources/pt/common_words.txt +9977 -0
- data/lib/kotoshu/language/resources/pt/confusion_sets.txt +18 -0
- data/lib/kotoshu/language/resources/ru/common_words.txt +9951 -0
- data/lib/kotoshu/language/resources/ru/confusion_sets.txt +5 -0
- data/lib/kotoshu/language/tokenizer/base.rb +170 -0
- data/lib/kotoshu/language/tokenizer/french_tokenizer.rb +170 -0
- data/lib/kotoshu/language/tokenizer/german_tokenizer.rb +41 -0
- data/lib/kotoshu/language/tokenizer/japanese_tokenizer.rb +60 -0
- data/lib/kotoshu/language/tokenizer/latin_tokenizer.rb +141 -0
- data/lib/kotoshu/language/tokenizer/portuguese_tokenizer.rb +160 -0
- data/lib/kotoshu/language/tokenizer/russian_tokenizer.rb +95 -0
- data/lib/kotoshu/language/tokenizer/spanish_tokenizer.rb +122 -0
- data/lib/kotoshu/language.rb +99 -0
- data/lib/kotoshu/languages/de/language.rb +546 -0
- data/lib/kotoshu/languages/en/language.rb +448 -0
- data/lib/kotoshu/languages/es/language.rb +459 -0
- data/lib/kotoshu/languages/fr/language.rb +493 -0
- data/lib/kotoshu/languages/ja/language.rb +477 -0
- data/lib/kotoshu/languages/pt/language.rb +423 -0
- data/lib/kotoshu/languages/ru/language.rb +404 -0
- data/lib/kotoshu/languages.rb +43 -0
- data/lib/kotoshu/metrics_collector.rb +222 -0
- data/lib/kotoshu/metrics_module.rb +110 -0
- data/lib/kotoshu/models/context.rb +119 -0
- data/lib/kotoshu/models/embedding_model.rb +182 -0
- data/lib/kotoshu/models/fasttext_model.rb +220 -0
- data/lib/kotoshu/models/nearest_neighbor.rb +87 -0
- data/lib/kotoshu/models/onnx_model.rb +333 -0
- data/lib/kotoshu/models/semantic_error.rb +165 -0
- data/lib/kotoshu/models/suggestion.rb +106 -0
- data/lib/kotoshu/models/word_embedding.rb +107 -0
- data/lib/kotoshu/paths.rb +53 -0
- data/lib/kotoshu/personal_dictionary.rb +94 -0
- data/lib/kotoshu/plugins/plugin.rb +61 -0
- data/lib/kotoshu/plugins/registry.rb +120 -0
- data/lib/kotoshu/project_config.rb +76 -0
- data/lib/kotoshu/readers/aff_data.rb +356 -0
- data/lib/kotoshu/readers/aff_reader.rb +375 -0
- data/lib/kotoshu/readers/condition_checker.rb +142 -0
- data/lib/kotoshu/readers/dic_reader.rb +118 -0
- data/lib/kotoshu/readers/file_reader.rb +347 -0
- data/lib/kotoshu/readers/lookup_builder.rb +299 -0
- data/lib/kotoshu/readers/readers.rb +6 -0
- data/lib/kotoshu/readers.rb +9 -0
- data/lib/kotoshu/resource_bundle.rb +30 -0
- data/lib/kotoshu/resource_manager.rb +295 -0
- data/lib/kotoshu/results/result.rb +165 -0
- data/lib/kotoshu/scripts/fasttext_to_onnx.py +275 -0
- data/lib/kotoshu/source_registry.rb +74 -0
- data/lib/kotoshu/spellchecker/parallel_checker.rb +90 -0
- data/lib/kotoshu/spellchecker.rb +298 -0
- data/lib/kotoshu/string_metrics.rb +153 -0
- data/lib/kotoshu/suggestions/context.rb +55 -0
- data/lib/kotoshu/suggestions/generator.rb +175 -0
- data/lib/kotoshu/suggestions/pipeline.rb +135 -0
- data/lib/kotoshu/suggestions/strategies/base_strategy.rb +296 -0
- data/lib/kotoshu/suggestions/strategies/composite_strategy.rb +140 -0
- data/lib/kotoshu/suggestions/strategies/edit_distance_strategy.rb +671 -0
- data/lib/kotoshu/suggestions/strategies/keyboard_proximity_strategy.rb +228 -0
- data/lib/kotoshu/suggestions/strategies/ngram_strategy.rb +130 -0
- data/lib/kotoshu/suggestions/strategies/phonetic_strategy.rb +329 -0
- data/lib/kotoshu/suggestions/strategies/semantic_strategy.rb +316 -0
- data/lib/kotoshu/suggestions/strategies/symspell_strategy.rb +275 -0
- data/lib/kotoshu/suggestions/suggestion.rb +174 -0
- data/lib/kotoshu/suggestions/suggestion_set.rb +238 -0
- data/lib/kotoshu/version.rb +5 -0
- data/lib/kotoshu.rb +493 -0
- data/script/validate_all_dictionaries.rb +444 -0
- data/sig/kotoshu.rbs +4 -0
- data/test_oop.rb +79 -0
- metadata +298 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Components
|
|
5
|
+
# Base class for POS (Part-of-Speech) taggers.
|
|
6
|
+
#
|
|
7
|
+
# POS taggers assign grammatical categories (NOUN, VERB, ADJ, etc.) to tokens.
|
|
8
|
+
# Different languages use different POS tagging strategies:
|
|
9
|
+
# - Latin scripts: Dictionary-based (Hunspell flags → POS tags)
|
|
10
|
+
# - CJK: Integrated with morphological analysis (tokenizer provides POS)
|
|
11
|
+
# - German: Compound word decomposition affects tagging
|
|
12
|
+
#
|
|
13
|
+
# Common POS tags (Penn Treebank style):
|
|
14
|
+
# - CC: Coordinating conjunction
|
|
15
|
+
# - CD: Cardinal number
|
|
16
|
+
# - DT: Determiner
|
|
17
|
+
# - EX: Existential there
|
|
18
|
+
# - FW: Foreign word
|
|
19
|
+
# - IN: Preposition or subordinating conjunction
|
|
20
|
+
# - JJ: Adjective
|
|
21
|
+
# - JJR: Adjective, comparative
|
|
22
|
+
# - JJS: Adjective, superlative
|
|
23
|
+
# - LS: List item marker
|
|
24
|
+
# - MD: Modal
|
|
25
|
+
# - NN: Noun, singular or mass
|
|
26
|
+
# - NNS: Noun, plural
|
|
27
|
+
# - NNP: Proper noun, singular
|
|
28
|
+
# - NNPS: Proper noun, plural
|
|
29
|
+
# - PDT: Predeterminer
|
|
30
|
+
# - POS: Possessive ending
|
|
31
|
+
# - PRP: Personal pronoun
|
|
32
|
+
# - PRP$: Possessive pronoun
|
|
33
|
+
# - RB: Adverb
|
|
34
|
+
# - RBR: Adverb, comparative
|
|
35
|
+
# - RBS: Adverb, superlative
|
|
36
|
+
# - RP: Particle
|
|
37
|
+
# - SYM: Symbol
|
|
38
|
+
# - TO: to
|
|
39
|
+
# - UH: Interjection
|
|
40
|
+
# - VB: Verb, base form
|
|
41
|
+
# - VBD: Verb, past tense
|
|
42
|
+
# - VBG: Verb, gerund or present participle
|
|
43
|
+
# - VBN: Verb, past participle
|
|
44
|
+
# - VBP: Verb, non-3rd person singular present
|
|
45
|
+
# - VBZ: Verb, 3rd person singular present
|
|
46
|
+
# - WDT: Wh-determiner
|
|
47
|
+
# - WP: Wh-pronoun
|
|
48
|
+
# - WP$: Possessive wh-pronoun
|
|
49
|
+
# - WRB: Wh-adverb
|
|
50
|
+
#
|
|
51
|
+
# Language-specific tags:
|
|
52
|
+
# - CJK uses its own tagset (e.g., Japanese: 名詞, 動詞, etc.)
|
|
53
|
+
# - German uses STTS tagset
|
|
54
|
+
#
|
|
55
|
+
# @abstract Subclasses must implement #tag
|
|
56
|
+
#
|
|
57
|
+
# @example Tagging tokens
|
|
58
|
+
# tagger = EnglishPosTagger.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
|
|
59
|
+
# tokens = [
|
|
60
|
+
# { token: "The", position: 0, length: 3 },
|
|
61
|
+
# { token: "dog", position: 4, length: 3 },
|
|
62
|
+
# { token: "runs", position: 8, length: 4 }
|
|
63
|
+
# ]
|
|
64
|
+
# tagged = tagger.tag(tokens)
|
|
65
|
+
# # => [
|
|
66
|
+
# # { token: "The", position: 0, length: 3, pos_tag: "DET", lemma: "the" },
|
|
67
|
+
# # { token: "dog", position: 4, length: 3, pos_tag: "NOUN", lemma: "dog" },
|
|
68
|
+
# # { token: "runs", position: 8, length: 4, pos_tag: "VERB", lemma: "run" }
|
|
69
|
+
# # ]
|
|
70
|
+
class PosTagger
  # Annotate a list of tokens with part-of-speech information.
  #
  # Receives token hashes (as produced by Tokenizer#tokenize) and is expected
  # to return them enriched with two extra keys:
  # - :pos_tag (String, nil) - POS category, or nil when it cannot be determined
  # - :lemma (String, nil) - Base form, or nil when it cannot be determined
  #
  # This base implementation only raises; concrete taggers override it.
  #
  # @abstract Subclasses must implement
  # @param tokens [Array<Hash>] Token hashes to annotate
  # @return [Array<Hash>] Token hashes carrying :pos_tag and :lemma
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def tag(tokens)
    raise NotImplementedError, "#{self.class} must implement #tag"
  end

  # Tag one word by wrapping it in a single-token list and delegating to #tag.
  #
  # The synthetic token starts at position 0 and spans the whole word.
  #
  # @param word [String] The word to tag
  # @return [Hash] First hash returned by #tag, or a hash with nil :pos_tag
  #   and :lemma when #tag returns no entries
  def tag_word(word)
    single = { token: word, position: 0, length: word.length }
    tagged = tag([single]).first
    tagged || { pos_tag: nil, lemma: nil }
  end
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Components
|
|
5
|
+
# Base class for spell checkers.
|
|
6
|
+
#
|
|
7
|
+
# Spell checkers validate words and provide suggestions for misspelled words.
|
|
8
|
+
# Different languages use different spell checking strategies:
|
|
9
|
+
# - Latin scripts: Dictionary lookup (Hunspell, Morfologik)
|
|
10
|
+
# - CJK: Confusion rule checking (no dictionary)
|
|
11
|
+
# - RTL: Dictionary lookup with bidirectional text handling
|
|
12
|
+
#
|
|
13
|
+
# @abstract Subclasses must implement #check and #suggest
|
|
14
|
+
#
|
|
15
|
+
# @example Checking a word
|
|
16
|
+
# checker = EnglishSpellChecker.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
|
|
17
|
+
# result = checker.check("hello")
|
|
18
|
+
# # => { found: true, stem: "hello", flags: [] }
|
|
19
|
+
#
|
|
20
|
+
# @example Getting suggestions
|
|
21
|
+
# result = checker.check("helo")
|
|
22
|
+
# # => { found: false, stem: nil, flags: [] }
|
|
23
|
+
# suggestions = checker.suggest("helo")
|
|
24
|
+
# # => [
|
|
25
|
+
# # { word: "hello", distance: 1, score: 0.9 },
|
|
26
|
+
# # { word: "help", distance: 2, score: 0.7 }
|
|
27
|
+
# # ]
|
|
28
|
+
class SpellChecker
  # Look a word up and report what is known about it.
  #
  # Implementations are expected to return a hash with:
  # - :found (Boolean) - whether the word is accepted
  # - :stem (String, nil) - stem/lemma when the word is found
  # - :flags (Array<String>) - morphological flags
  #
  # This base implementation only raises; concrete checkers override it.
  #
  # @abstract Subclasses must implement
  # @param word [String] The word to check
  # @return [Hash] Result with :found, :stem, :flags
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def check(word)
    raise NotImplementedError, "#{self.class} must implement #check"
  end

  # Propose replacements for a misspelled word.
  #
  # Implementations are expected to return suggestion hashes with:
  # - :word (String) - the suggested word
  # - :distance (Integer) - edit distance from the input
  # - :score (Float) - confidence in 0-1, higher is better
  # ordered from most to least relevant.
  #
  # This base implementation only raises; concrete checkers override it.
  #
  # @abstract Subclasses must implement
  # @param word [String] The misspelled word
  # @param max_suggestions [Integer] Upper bound on the number of suggestions
  # @return [Array<Hash>] Suggestion hashes
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def suggest(word, max_suggestions: 10)
    raise NotImplementedError, "#{self.class} must implement #suggest"
  end

  # Boolean-style shortcut around #check.
  #
  # Returns whatever #check stored under :found, without coercion.
  #
  # @param word [String] The word to check
  # @return [Boolean] The :found entry of the #check result
  def correct?(word)
    outcome = check(word)
    outcome[:found]
  end
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Components
|
|
5
|
+
# Base class for word form synthesizers.
|
|
6
|
+
#
|
|
7
|
+
# Synthesizers generate inflected forms from a lemma (base form).
|
|
8
|
+
# This is the inverse of lemmatization:
|
|
9
|
+
# - Lemmatization: "runs" → "run"
|
|
10
|
+
# - Synthesis: "run" → ["run", "runs", "running", "ran"]
|
|
11
|
+
#
|
|
12
|
+
# Different languages use different synthesis strategies:
|
|
13
|
+
# - Latin scripts: Hunspell affix rules
|
|
14
|
+
# - CJK: Not applicable (no inflection)
|
|
15
|
+
# - German: Compound word + affix synthesis
|
|
16
|
+
# - Finnish: Complex agglutinative patterns
|
|
17
|
+
#
|
|
18
|
+
# @abstract Subclasses must implement #synthesize
|
|
19
|
+
#
|
|
20
|
+
# @example Synthesizing English verb forms
|
|
21
|
+
# synthesizer = EnglishSynthesizer.new(aff_path: "en_US.aff", dic_path: "en_US.dic")
|
|
22
|
+
# forms = synthesizer.synthesize("run", "VERB")
|
|
23
|
+
# # => ["run", "runs", "running", "ran"]
|
|
24
|
+
#
|
|
25
|
+
# @example Synthesizing with POS constraint
|
|
26
|
+
# forms = synthesizer.synthesize("happy", "ADJ")
|
|
27
|
+
# # => ["happy", "happier", "happiest"]
|
|
28
|
+
class Synthesizer
  # Produce the inflected forms of a lemma for one part of speech.
  #
  # This base implementation only raises; concrete synthesizers override it.
  #
  # @abstract Subclasses must implement
  # @param lemma [String] The base form (lemma)
  # @param pos_tag [String] POS tag that constrains which forms are generated
  # @return [Array<String>] Inflected forms
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def synthesize(lemma, pos_tag)
    raise NotImplementedError, "#{self.class} must implement #synthesize"
  end

  # Produce inflected forms for each of the four built-in POS tags.
  #
  # Calls #synthesize once per tag, in the order NOUN, VERB, ADJ, ADV, and
  # collects the results keyed by tag. Subclasses may override this with a
  # cheaper implementation.
  #
  # @param lemma [String] The base form (lemma)
  # @return [Hash] Hash mapping POS tags to arrays of forms
  def synthesize_all(lemma)
    ['NOUN', 'VERB', 'ADJ', 'ADV'].each_with_object({}) do |pos, forms|
      forms[pos] = synthesize(lemma, pos)
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kotoshu
|
|
4
|
+
module Components
|
|
5
|
+
# Base class for tokenizers.
|
|
6
|
+
#
|
|
7
|
+
# Tokenizers split text into individual tokens (words, punctuation).
|
|
8
|
+
# Different languages use different tokenization strategies:
|
|
9
|
+
# - Latin scripts: Whitespace + punctuation
|
|
10
|
+
# - CJK: Morphological analysis
|
|
11
|
+
# - German: Compound word splitting
|
|
12
|
+
# - RTL: Right-to-left text handling
|
|
13
|
+
#
|
|
14
|
+
# @abstract Subclasses must implement #tokenize
|
|
15
|
+
#
|
|
16
|
+
# @example Tokenizing English text
|
|
17
|
+
# tokenizer = WhitespaceTokenizer.new
|
|
18
|
+
# tokens = tokenizer.tokenize("Hello, world!")
|
|
19
|
+
# # => [
|
|
20
|
+
# # { token: "Hello", position: 0, length: 5 },
|
|
21
|
+
# # { token: ",", position: 5, length: 1 },
|
|
22
|
+
# # { token: "world", position: 7, length: 5 },
|
|
23
|
+
# # { token: "!", position: 12, length: 1 }
|
|
24
|
+
# # ]
|
|
25
|
+
class Tokenizer
  # Break text into token hashes.
  #
  # Implementations are expected to return hashes with:
  # - :token (String) - the token text
  # - :position (Integer) - character position in the original text
  # - :length (Integer) - token length in characters
  #
  # Subclasses may attach further keys such as :pos_tag, :lemma,
  # :compound_part or :script.
  #
  # This base implementation only raises; concrete tokenizers override it.
  #
  # @abstract Subclasses must implement
  # @param text [String] The input text
  # @return [Array<Hash>] Token hashes
  # @raise [NotImplementedError] always, unless overridden by a subclass
  def tokenize(text)
    raise NotImplementedError, "#{self.class} must implement #tokenize"
  end

  # Tokenize and keep only each token's text.
  #
  # Delegates to #tokenize and extracts the :token entry of every hash.
  #
  # @param text [String] The input text
  # @return [Array<String>] Token strings in the order #tokenize produced them
  def tokenize_to_strings(text)
    entries = tokenize(text)
    entries.map { |entry| entry[:token] }
  end
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'tokenizer'
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
module Components
|
|
7
|
+
# Whitespace-based tokenizer for Latin-script languages.
|
|
8
|
+
#
|
|
9
|
+
# Splits text on whitespace and separates punctuation.
|
|
10
|
+
# Suitable for languages with space-separated words (English, French, German, etc.).
|
|
11
|
+
#
|
|
12
|
+
# This is a simple tokenizer that works well for most Latin-script languages.
|
|
13
|
+
# For more advanced tokenization (contractions, compounds), use language-specific
|
|
14
|
+
# tokenizers.
|
|
15
|
+
#
|
|
16
|
+
# @example Basic tokenization
|
|
17
|
+
# tokenizer = WhitespaceTokenizer.new
|
|
18
|
+
# tokens = tokenizer.tokenize("Hello, world!")
|
|
19
|
+
# # => [
|
|
20
|
+
# # { token: "Hello", position: 0, length: 5 },
|
|
21
|
+
# # { token: ",", position: 5, length: 1 },
|
|
22
|
+
# # { token: "world", position: 7, length: 5 },
|
|
23
|
+
# # { token: "!", position: 12, length: 1 }
|
|
24
|
+
# # ]
|
|
25
|
+
#
|
|
26
|
+
# @example Tokenizing to strings
|
|
27
|
+
# tokenizer.tokenize_to_strings("Hello, world!")
|
|
28
|
+
# # => ["Hello", ",", "world", "!"]
|
|
29
|
+
class WhitespaceTokenizer < Tokenizer
|
|
30
|
+
# Regex pattern for matching tokens (runs of word characters/apostrophes, or
# a single punctuation character).
#
# Uses the Unicode-aware POSIX bracket classes rather than \w and \s: in Ruby
# \w and \s only cover ASCII, which would split accented words such as
# "café" into "caf" + "é".
TOKEN_PATTERN = /[[:word:]']+|[^[:word:][:space:]]/.freeze
|
|
32
|
+
|
|
33
|
+
# Build a tokenizer around a token-matching regular expression.
#
# @param pattern [Regexp] Pattern whose matches become tokens
#   (defaults to TOKEN_PATTERN)
def initialize(pattern: TOKEN_PATTERN)
  @pattern = pattern
end
|
|
39
|
+
|
|
40
|
+
# Split text into tokens.
#
# Each token is a hash with:
# - :token (String) - The token text
# - :position (Integer) - Character position in original text
# - :length (Integer) - Token length in characters
#
# Positions come straight from the regex match data, so they stay correct
# for custom patterns that use lookaround. When the pattern contains capture
# groups, the first group is used as the token; if that group did not
# participate in a match, the whole match is used instead.
#
# @param text [String, nil] The input text
# @return [Array<Hash>] Array of token hashes (empty for nil or empty text)
def tokenize(text)
  return [] if text.nil? || text.empty?

  tokens = []
  text.scan(@pattern) do
    # String#scan sets the last-match data for each iteration of the block.
    match = Regexp.last_match
    group = match[1].nil? ? 0 : 1
    token = match[group]

    tokens << {
      token: token,
      position: match.begin(group),
      length: token.length
    }
  end

  tokens
end
|
|
71
|
+
|
|
72
|
+
# Expose the regular expression this tokenizer matches tokens with.
#
# @return [Regexp] The pattern stored at construction time
def pattern
  @pattern
end
|
|
78
|
+
|
|
79
|
+
# Check if a character is a word character.
#
# Uses the Unicode-aware [[:word:]] class (letters, marks, numbers,
# connector punctuation) because Ruby's \w only matches ASCII and would
# reject accented letters such as "é" or "ü".
#
# @param char [String] Single character
# @return [Boolean] True if word character
def word_char?(char)
  char.match?(/[[:word:]]/)
end
|
|
86
|
+
|
|
87
|
+
# Check if a character is punctuation.
#
# A character counts as punctuation when it is neither a word character nor
# whitespace. The Unicode-aware POSIX bracket classes are used because Ruby's
# \w and \s only match ASCII, which would misreport accented letters such as
# "é" as punctuation.
#
# @param char [String] Single character
# @return [Boolean] True if punctuation
def punctuation?(char)
  char.match?(/[^[:word:][:space:]]/)
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../configuration"
|
|
4
|
+
|
|
5
|
+
module Kotoshu
|
|
6
|
+
class Configuration
|
|
7
|
+
# Builder for creating immutable Configuration objects.
#
# Collects settings in a private hash seeded from DEFAULTS and converts them
# into a Configuration that is frozen before it is handed out (see #to_config).
#
# Every setting has two entry points:
# - an attribute writer (`b.language = "en-GB"`) for use inside a block, and
# - a fluent `with_*` method that returns the builder so calls can be chained.
#
# @example Building with block
#   config = Configuration::Builder.build do |b|
#     b.dictionary_path = "words.txt"
#     b.language = "en-GB"
#   end
#
# @example Building with fluent methods
#   config = Configuration::Builder.new
#     .with_dictionary_path("words.txt")
#     .with_language("en-GB")
#     .to_config
class Builder
  # Build an immutable configuration.
  #
  # @yield [builder] Optional block for configuration
  # @yieldparam builder [Builder] Fresh builder pre-loaded with DEFAULTS
  # @return [Configuration] Frozen configuration object
  #
  # @example With block
  #   config = Builder.build do |b|
  #     b.dictionary_path = "words.txt"
  #   end
  #
  # @example Without block (uses defaults)
  #   config = Builder.build
  def self.build
    builder_instance = new
    yield(builder_instance) if block_given?
    builder_instance.to_config
  end

  # Create a new builder whose settings start as a shallow copy of DEFAULTS.
  def initialize
    @settings = DEFAULTS.dup
  end

  # NOTE on the attribute writers below: when called with assignment syntax
  # (`b.language = "en"`) Ruby evaluates the expression to the assigned value,
  # so the returned builder is only observable via send/public_send. Use the
  # with_* methods for chaining.

  # Set dictionary path.
  #
  # @param path [String] Path to dictionary file
  # @return [self] The builder
  def dictionary_path=(path)
    @settings[:dictionary_path] = path
    self
  end

  # Set dictionary type.
  #
  # @param type [Symbol] Dictionary type
  # @return [self] The builder
  def dictionary_type=(type)
    @settings[:dictionary_type] = type
    self
  end

  # Set language code.
  #
  # @param lang [String] Language code
  # @return [self] The builder
  def language=(lang)
    @settings[:language] = lang
    self
  end

  # Set locale.
  #
  # @param locale [String, nil] Locale
  # @return [self] The builder
  def locale=(locale)
    @settings[:locale] = locale
    self
  end

  # Set max suggestions.
  #
  # @param max [Integer] Maximum suggestions
  # @return [self] The builder
  def max_suggestions=(max)
    @settings[:max_suggestions] = max
    self
  end

  # Set case sensitivity.
  #
  # @param sensitive [Boolean] Case sensitive flag
  # @return [self] The builder
  def case_sensitive=(sensitive)
    @settings[:case_sensitive] = sensitive
    self
  end

  # Set verbose mode.
  #
  # @param verbose [Boolean] Verbose flag
  # @return [self] The builder
  def verbose=(verbose)
    @settings[:verbose] = verbose
    self
  end

  # Set suggestion algorithms.
  #
  # @param algorithms [Array<Class>, nil] Suggestion algorithms
  # @return [self] The builder
  def suggestion_algorithms=(algorithms)
    @settings[:suggestion_algorithms] = algorithms
    self
  end

  # Set custom words.
  #
  # Stores a frozen copy so later changes to the caller's array do not leak
  # into the builder.
  #
  # @param words [Array<String>] Custom words
  # @return [self] The builder
  def custom_words=(words)
    @settings[:custom_words] = words.dup.freeze
    self
  end

  # Set encoding.
  #
  # @param encoding [String] Character encoding
  # @return [self] The builder
  def encoding=(encoding)
    @settings[:encoding] = encoding
    self
  end

  # Fluent method to set dictionary path.
  #
  # @param path [String] Path to dictionary file
  # @return [self] The builder, for chaining
  def with_dictionary_path(path)
    self.dictionary_path = path
    self
  end

  # Fluent method to set dictionary type.
  #
  # @param type [Symbol] Dictionary type
  # @return [self] The builder, for chaining
  def with_dictionary_type(type)
    self.dictionary_type = type
    self
  end

  # Fluent method to set language.
  #
  # @param lang [String] Language code
  # @return [self] The builder, for chaining
  def with_language(lang)
    self.language = lang
    self
  end

  # Fluent method to set locale.
  #
  # @param locale [String, nil] Locale
  # @return [self] The builder, for chaining
  def with_locale(locale)
    self.locale = locale
    self
  end

  # Fluent method to set max suggestions.
  #
  # @param max [Integer] Maximum suggestions
  # @return [self] The builder, for chaining
  def with_max_suggestions(max)
    self.max_suggestions = max
    self
  end

  # Fluent method to set case sensitivity.
  #
  # @param sensitive [Boolean] Case sensitive flag
  # @return [self] The builder, for chaining
  def with_case_sensitive(sensitive)
    self.case_sensitive = sensitive
    self
  end

  # Fluent method to set verbose mode.
  #
  # @param verbose [Boolean] Verbose flag
  # @return [self] The builder, for chaining
  def with_verbose(verbose)
    self.verbose = verbose
    self
  end

  # Fluent method to set suggestion algorithms.
  #
  # @param algorithms [Array<Class>, nil] Suggestion algorithms
  # @return [self] The builder, for chaining
  def with_suggestion_algorithms(algorithms)
    self.suggestion_algorithms = algorithms
    self
  end

  # Fluent method to set custom words (stored as a frozen copy).
  #
  # @param words [Array<String>] Custom words
  # @return [self] The builder, for chaining
  def with_custom_words(words)
    self.custom_words = words
    self
  end

  # Fluent method to set encoding.
  #
  # @param encoding [String] Character encoding
  # @return [self] The builder, for chaining
  def with_encoding(encoding)
    self.encoding = encoding
    self
  end

  # Convert builder to frozen Configuration.
  #
  # The Configuration receives a copy of the settings hash, so the builder
  # can keep being modified and reused afterwards.
  #
  # @return [Configuration] Frozen configuration object
  def to_config
    config = Configuration.new(@settings.dup)
    config.freeze
    config
  end
end
|
|
208
|
+
end
|
|
209
|
+
end
|