kabosu 0.6.10-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
module Kabosu
  # An ordered, Enumerable collection of morphemes.
  #
  # A list wraps either a plain Array of morphemes or a "lazy source": any
  # object that responds to +morpheme_at+, +size+ and +surfaces+. With a lazy
  # source, each morpheme is fetched through +morpheme_at+ the first time it is
  # accessed and then cached.
  class MorphemeList
    include Enumerable

    attr_accessor :internal_cost

    # source_or_morphemes - an Array of morphemes, or a lazy source.
    # internal_cost       - optional cost for the list. When omitted, it is
    #                       taken from the lazy source if (and only if) the
    #                       source exposes +internal_cost+.
    def initialize(source_or_morphemes, internal_cost: nil)
      @source = source_or_morphemes if lazy_source?(source_or_morphemes)
      @morphemes = @source ? Array.new(@source.size) : source_or_morphemes
      @internal_cost = internal_cost || source_internal_cost
    end

    # Yields each morpheme in order and returns self. Without a block, returns
    # an Enumerator that knows its size.
    def each(&block)
      return enum_for(:each) { size } unless block

      if @source
        size.times { |i| block.call(fetch(i)) }
      else
        @morphemes.each(&block)
      end
      self
    end

    # Element access. A Range returns an Array slice; anything else is coerced
    # with Integer(). Negative indices count from the end; out-of-range
    # indices return nil.
    def [](index)
      return to_a[index] if index.is_a?(Range)

      idx = normalize_index(index)
      return nil if idx.nil?

      fetch(idx)
    end

    # Without an argument returns the first morpheme (or nil); with +n+
    # returns an Array of up to +n+ leading morphemes.
    #
    # Raises ArgumentError when +n+ is negative.
    def first(n = nil)
      return self[0] unless n

      n = Integer(n)
      raise ArgumentError, "negative array size" if n.negative?

      limit = [n, size].min
      (0...limit).map { fetch(_1) }
    end

    # Without an argument returns the last morpheme (or nil); with +n+
    # returns an Array of up to +n+ trailing morphemes.
    #
    # Raises ArgumentError when +n+ is negative.
    def last(n = nil)
      return self[-1] unless n

      n = Integer(n)
      raise ArgumentError, "negative array size" if n.negative?

      start = [size - n, 0].max
      (start...size).map { fetch(_1) }
    end

    def size
      @morphemes.size
    end

    def empty?
      @morphemes.empty?
    end

    # Surface strings of all morphemes. A lazy source is asked directly so
    # that morphemes do not have to be materialized.
    def surfaces
      return @source.surfaces if @source&.respond_to?(:surfaces)

      map(&:surface)
    end

    def readings
      map(&:reading_form)
    end

    def dictionary_forms
      map(&:dictionary_form)
    end

    def normalized_forms
      map(&:normalized_form)
    end

    def total_costs
      map(&:total_cost)
    end

    def synonym_group_ids
      map(&:synonym_group_ids)
    end

    # Joins all surfaces back into the original text (no spaces, for Japanese text).
    def to_text
      surfaces.join
    end

    # Filter morphemes by POS. Accepts a PosMatcher or an array pattern.
    # Returns a new MorphemeList with only matching morphemes.
    #
    #   list.select_pos(Kabosu::PosMatcher.nouns)
    #   list.select_pos(["名詞", "固有名詞"])
    #
    def select_pos(matcher_or_pattern)
      matcher = coerce_to_matcher(matcher_or_pattern)
      self.class.new(matcher.filter(to_a))
    end

    # Inverse of select_pos. Returns a new MorphemeList excluding matching morphemes.
    def reject_pos(matcher_or_pattern)
      matcher = coerce_to_matcher(matcher_or_pattern)
      self.class.new(matcher.reject(to_a))
    end

    # All morphemes as a fresh Array (materializing a lazy source fully).
    def to_a
      return @morphemes.dup unless @source

      (0...size).map { fetch(_1) }
    end

    def inspect
      base = "#<Kabosu::MorphemeList (#{size} morphemes)"
      base += " cost=#{@internal_cost}" if @internal_cost
      base + ": #{surfaces.join(" | ")}>"
    end

    private

    # Cost reported by the lazy source, or nil when there is no source or the
    # source does not expose one. lazy_source? does not require
    # +internal_cost+, so it must be probed before it is called.
    def source_internal_cost
      @source.internal_cost if @source.respond_to?(:internal_cost)
    end

    def coerce_to_matcher(matcher_or_pattern)
      case matcher_or_pattern
      when PosMatcher
        matcher_or_pattern
      when Array
        PosMatcher.new(matcher_or_pattern)
      else
        raise ArgumentError, "expected a PosMatcher or an Array pattern, got #{matcher_or_pattern.class}"
      end
    end

    def lazy_source?(obj)
      obj.respond_to?(:morpheme_at) && obj.respond_to?(:size) && obj.respond_to?(:surfaces)
    end

    # Converts +index+ to a non-negative in-range Integer, or nil when it
    # falls outside the list.
    def normalize_index(index)
      return nil if size.zero?

      idx = Integer(index)
      idx += size if idx.negative?
      return nil if idx.negative? || idx >= size

      idx
    end

    # Morpheme at +idx+; for a lazy source it is fetched once and cached.
    def fetch(idx)
      return @morphemes[idx] unless @source

      @morphemes[idx] ||= @source.morpheme_at(idx)
    end
  end
end
@@ -0,0 +1,119 @@
1
module Kabosu
  # Immutable predicate over part-of-speech (POS) arrays.
  #
  # A matcher is built either from one or more patterns or from a block, and
  # can be combined with other matchers using |, & and -.
  class PosMatcher
    # Build a matcher from POS patterns or a block.
    #
    #   # From a block
    #   PosMatcher.new { |pos| pos[0] == "名詞" }
    #
    #   # Single pattern (array of strings; "*" or nil = match anything)
    #   PosMatcher.new(["名詞", "固有名詞", "*", "*"])
    #
    #   # Multiple patterns (matches if any pattern matches)
    #   PosMatcher.new(["名詞", "固有名詞"], ["動詞", "*", "*", "*"])
    #
    # Patterns are copied before being frozen, so the arrays passed in by the
    # caller stay mutable.
    #
    # Raises ArgumentError when both patterns and a block are given, when
    # neither is given, or when a pattern is not an Array.
    def initialize(*patterns, &block)
      if block
        raise ArgumentError, "cannot supply both patterns and a block" unless patterns.empty?

        @proc = block
      elsif patterns.empty?
        raise ArgumentError, "must supply at least one pattern or a block"
      else
        @patterns = patterns.map { |pattern| copy_pattern(pattern) }.freeze
      end

      freeze
    end

    # Returns true if the morpheme (or raw POS array) matches this matcher.
    def match?(morpheme_or_pos)
      pos = extract_pos(morpheme_or_pos)

      if @proc
        @proc.call(pos)
      else
        @patterns.any? { |pattern| pattern_match?(pattern, pos) }
      end
    end

    # Return matching morphemes as an Array.
    def filter(morphemes)
      morphemes.select { |m| match?(m) }
    end

    # Return non-matching morphemes as an Array.
    def reject(morphemes)
      morphemes.reject { |m| match?(m) }
    end

    # Union: matches if either matcher matches.
    def |(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) || b.match?(pos) }
    end

    # Intersection: matches if both matchers match.
    def &(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) && b.match?(pos) }
    end

    # Difference: matches self but not other.
    def -(other)
      difference(other)
    end

    def difference(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) && !b.match?(pos) }
    end

    # ── Pre-built matchers ──

    def self.nouns
      @nouns ||= new(["名詞"])
    end

    def self.verbs
      @verbs ||= new(["動詞"])
    end

    def self.adjectives
      @adjectives ||= new(["形容詞"])
    end

    def self.particles
      @particles ||= new(["助詞"])
    end

    def self.auxiliary_verbs
      @auxiliary_verbs ||= new(["助動詞"])
    end

    def self.adverbs
      @adverbs ||= new(["副詞"])
    end

    def self.proper_nouns
      @proper_nouns ||= new(["名詞", "固有名詞"])
    end

    private

    # Validates one pattern and returns a frozen private copy of it.
    def copy_pattern(pattern)
      unless pattern.is_a?(Array)
        raise ArgumentError, "each pattern must be an Array, got #{pattern.class}"
      end

      pattern.dup.freeze
    end

    # Accepts either a raw POS Array or any object exposing #part_of_speech.
    def extract_pos(morpheme_or_pos)
      if morpheme_or_pos.is_a?(Array)
        morpheme_or_pos
      elsif morpheme_or_pos.respond_to?(:part_of_speech)
        morpheme_or_pos.part_of_speech
      else
        raise ArgumentError, "expected an Array or an object responding to #part_of_speech"
      end
    end

    # A pattern matches when every slot is a wildcard (nil or "*") or equals
    # the POS element at the same position. Slots beyond the pattern's length
    # are not checked.
    def pattern_match?(pattern, pos)
      pattern.each_with_index.all? do |slot, i|
        slot.nil? || slot == "*" || slot == pos[i]
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
namespace :release do
  desc "Bump version. Usage: rake release:bump[1.2.3]"
  # Rewrites the version declaration in version.rb (next to this file) and in
  # ext/kabosu/Cargo.toml. Both files are read and checked before either one
  # is written, so a missing declaration aborts the task instead of leaving
  # the two files out of sync or reporting an update that never happened.
  task :bump, [:version] do |_t, args|
    version = args[:version]
    abort "Usage: rake release:bump[1.2.3]" unless version&.match?(/\A\d+\.\d+\.\d+\z/)

    # [path, pattern locating the current declaration, replacement text]
    targets = [
      # lib/kabosu/version.rb
      [File.expand_path("version.rb", __dir__), /VERSION = ".*"/, %(VERSION = "#{version}")],
      # ext/kabosu/Cargo.toml (only the first line-leading `version = "..."` is replaced)
      [File.expand_path("../../ext/kabosu/Cargo.toml", __dir__), /^version = ".*"/, %(version = "#{version}")]
    ]

    updates = targets.map do |path, pattern, replacement|
      content = File.read(path)
      abort "No version declaration matching #{pattern.inspect} found in #{path}" unless content.match?(pattern)

      [path, content.sub(pattern, replacement)]
    end

    updates.each do |path, new_content|
      File.write(path, new_content)
      puts "Updated #{path}"
    end

    puts "\nVersion bumped to #{version}"
    puts "Next steps:"
    puts " jj commit -m 'Bump version to #{version}'"
    puts " jj tag set v#{version}"
    puts " jj git push --all"
  end
end
28
+
29
desc "Build the gem"
task(:build) { sh "gem build kabosu.gemspec" }

desc "Build and push the gem to RubyGems"
task release: :build do
  # Push whichever built gem file was modified most recently.
  newest = Dir["kabosu-*.gem"].max_by { File.mtime(_1) }
  abort "No .gem file found" if newest.nil?

  sh "gem push #{newest}"
end
@@ -0,0 +1,86 @@
1
+ require "kabosu/dict_manager"
2
+
3
+ namespace :kabosu do
4
+ # ── Install ──
5
+
6
+ desc "Install the core dictionary (default). VERSION=YYYYMMDD to pin a specific release."
7
+ task :install do
8
+ Kabosu::DictManager.new.install("core", version: ENV["VERSION"])
9
+ end
10
+
11
+ namespace :install do
12
+ desc "Install the small dictionary. VERSION=YYYYMMDD to pin a specific release."
13
+ task :small do
14
+ Kabosu::DictManager.new.install("small", version: ENV["VERSION"])
15
+ end
16
+
17
+ desc "Install the core dictionary. VERSION=YYYYMMDD to pin a specific release."
18
+ task :core do
19
+ Kabosu::DictManager.new.install("core", version: ENV["VERSION"])
20
+ end
21
+
22
+ desc "Install the full dictionary. VERSION=YYYYMMDD to pin a specific release."
23
+ task :full do
24
+ Kabosu::DictManager.new.install("full", version: ENV["VERSION"])
25
+ end
26
+ end
27
+
28
+ # ── Remove ──
29
+
30
+ desc "Remove all installed dictionaries, or a specific one with EDITION=small|core|full and/or VERSION=YYYYMMDD"
31
+ task :remove do
32
+ Kabosu::DictManager.new.remove(edition: ENV["EDITION"], version: ENV["VERSION"])
33
+ end
34
+
35
+ namespace :remove do
36
+ desc "Remove the small dictionary."
37
+ task :small do
38
+ Kabosu::DictManager.new.remove(edition: "small", version: ENV["VERSION"])
39
+ end
40
+
41
+ desc "Remove the core dictionary."
42
+ task :core do
43
+ Kabosu::DictManager.new.remove(edition: "core", version: ENV["VERSION"])
44
+ end
45
+
46
+ desc "Remove the full dictionary."
47
+ task :full do
48
+ Kabosu::DictManager.new.remove(edition: "full", version: ENV["VERSION"])
49
+ end
50
+ end
51
+
52
+ # ── Info ──
53
+
54
+ desc "List installed Sudachi dictionaries"
55
+ task :list do
56
+ manager = Kabosu::DictManager.new
57
+ dicts = manager.installed
58
+
59
+ if dicts.empty?
60
+ puts "No dictionaries installed. Run: rake kabosu:install"
61
+ else
62
+ puts "Installed dictionaries (#{manager.dir}):"
63
+ puts
64
+ dicts.each do |d|
65
+ size_mb = (File.size(d[:path]).to_f / 1024 / 1024).round(1)
66
+ puts " #{d[:version]} / #{d[:edition].ljust(5)} (#{size_mb} MB)"
67
+ puts " #{d[:path]}"
68
+ end
69
+ end
70
+ end
71
+
72
+ desc "Show available dictionary versions from GitHub"
73
+ task :versions do
74
+ Kabosu::DictManager.new.available_versions.each { |v| puts v }
75
+ end
76
+
77
+ desc "Show the path to the best available dictionary. EDITION=small|core|full to be specific."
78
+ task :path do
79
+ begin
80
+ puts Kabosu::DictManager.new.find(edition: ENV["EDITION"])
81
+ rescue Kabosu::DictManager::DictNotFound => e
82
+ $stderr.puts e.message
83
+ exit 1
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,3 @@
1
module Kabosu
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = "0.6.10".freeze
end
data/lib/kabosu.rb ADDED
@@ -0,0 +1,239 @@
1
+ require_relative "kabosu/version"
2
+ require_relative "kabosu/kabosu"
3
+ require_relative "kabosu/dict_manager"
4
+ require_relative "kabosu/pos_matcher"
5
+ require_relative "kabosu/morpheme_list"
6
+
7
+ module Kabosu
8
# Root of the Kabosu error hierarchy.
class Error < StandardError
end

# Raised by Dictionary.new when initialization fails for a configuration reason.
class ConfigError < Error
end

# Raised by Dictionary.new for initialization failures not classified as configuration errors.
class DictionaryError < Error
end

# Raised by Dictionary#lookup when the underlying lookup fails.
class LookupError < Error
end

# Raised when tokenizing text or splitting a morpheme fails.
class TokenizationError < Error
end

# Raised by Kabosu.split_sentences when sentence splitting fails.
class SentenceSplitError < Error
end
14
+
15
# One sentence produced by Kabosu.split_sentences(..., ranges: true): the
# sentence text together with its start and end positions.
# NOTE(review): the unit of the positions (bytes vs. characters) is decided by
# the splitter and is not visible here — confirm before relying on it.
class SentenceRange
  attr_reader :start, :end, :text

  def initialize(start, finish, text)
    @start, @end, @text = start, finish, text
  end

  # The range as a [start, end, text] triple.
  def to_a
    [start, self.end, text]
  end
end
28
+
29
# Drop any MODE_* constants already defined directly on this module
# (presumably by the native extension — TODO confirm) so the Symbol values
# below can be assigned without redefining an existing constant.
%i[MODE_A MODE_B MODE_C].each do |mode_const|
  remove_const(mode_const) if const_defined?(mode_const, false)
end

# Split modes, expressed as Symbols.
MODE_A = :a
MODE_B = :b
MODE_C = :c

# TokenBatch is hidden from outside the module whenever it is defined here.
private_constant :TokenBatch if const_defined?(:TokenBatch, false)
36
+
37
# Translates a mode Symbol (:a, :b or :c) into the upper-case String form
# ("A", "B" or "C").
#
# mode       - the value to translate.
# param_name - name used for the value in the error message.
#
# Raises ArgumentError for any other value.
def self.normalize_mode(mode, param_name: "mode")
  normalized = { a: "A", b: "B", c: "C" }[mode]
  return normalized if normalized

  raise ArgumentError, "invalid #{param_name} #{mode.inspect}; expected :a, :b, or :c"
end
49
+ private_class_method :normalize_mode
50
+
51
+ # ── Dictionary: keyword API and management ──
52
+
53
# Keyword-argument front end over the previously defined Dictionary
# implementation (presumably the native extension — TODO confirm), plus
# class-level helpers that delegate to DictManager.
class Dictionary
  class << self
    # Keep a handle on the underlying constructor. The guard makes this
    # idempotent: if this file is evaluated a second time, re-aliasing would
    # point _new at the wrapper below and make it call itself forever.
    alias_method :_new, :new unless method_defined?(:_new) || private_method_defined?(:_new)

    # Builds a dictionary.
    #
    # config      - String or nil.
    # system_dict - String or nil.
    # user_dicts  - Array of Strings, or nil.
    #
    # At least one of config / system_dict is required.
    #
    # Raises ArgumentError for invalid arguments, and ConfigError or
    # DictionaryError when the underlying constructor raises RuntimeError.
    def new(config: nil, system_dict: nil, user_dicts: nil)
      unless config.nil? || config.is_a?(String)
        raise ArgumentError, "config must be a String or nil"
      end
      unless system_dict.nil? || system_dict.is_a?(String)
        raise ArgumentError, "system_dict must be a String or nil"
      end
      unless user_dicts.nil? || user_dicts.is_a?(Array)
        raise ArgumentError, "user_dicts must be an Array<String> or nil"
      end
      if user_dicts&.any? { !_1.is_a?(String) }
        raise ArgumentError, "user_dicts must contain only String values"
      end

      if config.nil? && system_dict.nil?
        raise ArgumentError, "either config or system_dict is required"
      end
      _new(config, system_dict, user_dicts)
    rescue RuntimeError => e
      raise map_dictionary_init_error(e, config: config, system_dict: system_dict)
    end

    # Delegates to DictManager#install.
    def install(edition = "core", version: nil)
      dict_manager.install(edition, version: version)
    end

    # Delegates to DictManager#find.
    def path(edition: nil)
      dict_manager.find(edition: edition)
    end

    # Delegates to DictManager#installed.
    def list
      dict_manager.installed
    end

    private

    # Chooses the Kabosu error class for a failed initialization: ConfigError
    # when only a config was supplied or the message mentions configuration /
    # JSON, DictionaryError otherwise.
    def map_dictionary_init_error(error, config:, system_dict:)
      message = error.message
      if config && system_dict.nil?
        ConfigError.new(message)
      elsif message.match?(/config|setting\.json|json/i)
        ConfigError.new(message)
      else
        DictionaryError.new(message)
      end
    end

    # Memoized DictManager shared by the class-level helpers.
    def dict_manager
      @dict_manager ||= DictManager.new
    end
  end

  # Same idempotent-alias guard as for _new above.
  alias_method :_create, :create unless method_defined?(:_create) || private_method_defined?(:_create)
  alias_method :_lookup, :lookup unless method_defined?(:_lookup) || private_method_defined?(:_lookup)

  # Creates a tokenizer.
  #
  # Accepted keywords:
  #   mode:       :a, :b or :c (default MODE_C)
  #   fields:     Array of Strings/Symbols, or nil (default nil)
  #   debug:      true or false (default false)
  #   projection: must be nil; any other value raises NotImplementedError
  #
  # Raises ArgumentError for unknown keywords or invalid values.
  def create(**options)
    unknown = options.keys - %i[mode fields debug projection]
    raise ArgumentError, "unknown keyword(s): #{unknown.join(', ')}" unless unknown.empty?

    mode = options.fetch(:mode, MODE_C)
    fields = options.fetch(:fields, nil)
    debug = options.fetch(:debug, false)
    projection = options.fetch(:projection, nil)

    unless fields.nil? || fields.is_a?(Array)
      raise ArgumentError, "fields must be an Array<String|Symbol> or nil"
    end
    if fields&.any? { !(_1.is_a?(String) || _1.is_a?(Symbol)) }
      raise ArgumentError, "fields must contain only String or Symbol values"
    end
    unless debug == true || debug == false
      raise ArgumentError, "debug must be true or false"
    end

    unless projection.nil?
      raise NotImplementedError, "projection is not supported yet"
    end

    # normalize_mode is a private class method of Kabosu, hence __send__.
    mode_str = Kabosu.__send__(:normalize_mode, mode)
    _create(mode_str, fields, debug)
  end

  # Looks +text+ up and wraps the result in a MorphemeList.
  #
  # Raises ArgumentError when +text+ is not a String, and LookupError when
  # the underlying lookup raises RuntimeError.
  def lookup(text)
    unless text.is_a?(String)
      raise ArgumentError, "text must be a String"
    end
    MorphemeList.new(_lookup(text))
  rescue RuntimeError => e
    raise LookupError.new(e.message), cause: e
  end
end
148
+
149
+ # ── Tokenizer: wrap output in MorphemeList ──
150
+
151
# Wraps the previously defined Tokenizer#tokenize (presumably from the native
# extension — TODO confirm) so that it validates its input and returns a
# MorphemeList.
class Tokenizer
  # Keep a handle on the underlying implementation. The guard makes this
  # idempotent: if this file is evaluated a second time, re-aliasing would
  # point _tokenize at the wrapper below and make it call itself forever.
  alias_method :_tokenize, :tokenize unless method_defined?(:_tokenize) || private_method_defined?(:_tokenize)

  # Tokenizes +text+ and returns a MorphemeList. The list's internal_cost is
  # taken from the raw result when that result exposes one.
  #
  # Raises ArgumentError when +text+ is not a String, and TokenizationError
  # when the underlying tokenizer raises RuntimeError.
  def tokenize(text)
    unless text.is_a?(String)
      raise ArgumentError, "text must be a String"
    end

    batch = _tokenize(text)
    cost = batch.respond_to?(:internal_cost) ? batch.internal_cost : nil
    MorphemeList.new(batch, internal_cost: cost)
  rescue RuntimeError => e
    raise TokenizationError.new(e.message), cause: e
  end
end
166
+
167
# Wraps the previously defined Morpheme#split (presumably from the native
# extension — TODO confirm) with keyword arguments and a MorphemeList result.
class Morpheme
  # Keep a handle on the underlying implementation. The guard makes this
  # idempotent: if this file is evaluated a second time, re-aliasing would
  # point _split at the wrapper below and make it call itself forever.
  alias_method :_split, :split unless method_defined?(:_split) || private_method_defined?(:_split)

  # Splits this morpheme and returns the parts as a MorphemeList.
  #
  # mode       - :a, :b or :c (default MODE_C).
  # add_single - true or false; passed through to the underlying split.
  #
  # Raises ArgumentError for invalid arguments, and TokenizationError when
  # the underlying split raises RuntimeError.
  def split(mode: MODE_C, add_single: true)
    unless add_single == true || add_single == false
      raise ArgumentError, "add_single must be true or false"
    end
    # normalize_mode is a private class method of Kabosu, hence __send__.
    mode_str = Kabosu.__send__(:normalize_mode, mode)
    MorphemeList.new(_split(mode_str, nil, add_single))
  rescue RuntimeError => e
    raise TokenizationError.new(e.message), cause: e
  end
end
180
+
181
# Splits +text+ into sentences.
#
# limit        - positive Integer or nil; passed through to the splitter.
# with_checker - true/false. When true, a dictionary path is handed to the
#                splitter: +dictionary+ if given, otherwise Dictionary.path.
# ranges       - true/false. When true, returns SentenceRange objects instead
#                of the splitter's plain result.
# dictionary   - String path or nil; only used when with_checker is true.
#
# Raises ArgumentError for invalid arguments, and SentenceSplitError when the
# underlying splitter raises RuntimeError.
def self.split_sentences(text, limit: nil, with_checker: false, ranges: false, dictionary: nil)
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
  raise ArgumentError, "limit must be an Integer or nil" unless limit.nil? || limit.is_a?(Integer)
  raise ArgumentError, "limit must be greater than 0" if limit && limit < 1
  raise ArgumentError, "with_checker must be true or false" unless with_checker == true || with_checker == false
  raise ArgumentError, "ranges must be true or false" unless ranges == true || ranges == false
  raise ArgumentError, "dictionary must be a String path or nil" unless dictionary.nil? || dictionary.is_a?(String)

  # A dictionary is only resolved when the checker is requested.
  checker_dict = with_checker ? (dictionary || Dictionary.path) : nil

  return _split_sentences(text, limit, checker_dict) unless ranges

  _split_sentences_with_ranges(text, limit, checker_dict).map do |(from, to, sentence)|
    SentenceRange.new(from, to, sentence)
  end
rescue RuntimeError => e
  raise SentenceSplitError.new(e.message), cause: e
end
216
+
217
+ # ── Convenience tokenization ──
218
+
219
# Tokenize text using an explicitly provided tokenizer.
#
#   dict = Kabosu::Dictionary.new(system_dict: Kabosu::Dictionary.path)
#   tok  = dict.create(mode: :a)
#   Kabosu.tokenize("東京都に住んでいる", tokenizer: tok)
#
# Returns a MorphemeList whose internal_cost comes from the raw result when
# that result exposes one.
#
# Raises ArgumentError when +text+ is not a String or +tokenizer+ is not a
# Tokenizer, and TokenizationError when tokenization raises RuntimeError.
def self.tokenize(text, tokenizer:)
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
  raise ArgumentError, "tokenizer must be a Kabosu::Tokenizer" unless tokenizer.is_a?(Tokenizer)

  # Call the aliased underlying tokenizer directly rather than the wrapper.
  raw = tokenizer.__send__(:_tokenize, text)
  cost = raw.internal_cost if raw.respond_to?(:internal_cost)
  MorphemeList.new(raw, internal_cost: cost)
rescue RuntimeError => e
  raise TokenizationError.new(e.message), cause: e
end
239
+ end