kabosu 0.6.10-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +191 -0
- data/README.md +205 -0
- data/lib/kabosu/3.1/kabosu.so +0 -0
- data/lib/kabosu/3.2/kabosu.so +0 -0
- data/lib/kabosu/3.3/kabosu.so +0 -0
- data/lib/kabosu/3.4/kabosu.so +0 -0
- data/lib/kabosu/4.0/kabosu.so +0 -0
- data/lib/kabosu/dict_manager.rb +239 -0
- data/lib/kabosu/morpheme_list.rb +157 -0
- data/lib/kabosu/pos_matcher.rb +119 -0
- data/lib/kabosu/release.rake +39 -0
- data/lib/kabosu/tasks.rake +86 -0
- data/lib/kabosu/version.rb +3 -0
- data/lib/kabosu.rb +239 -0
- metadata +116 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
module Kabosu
  # Ordered, Enumerable collection of morphemes.
  #
  # A list is backed either by a plain collection of morphemes (stored as
  # given, without copying) or by a "lazy source": any object responding to
  # +morpheme_at+, +size+ and +surfaces+. With a lazy source, each morpheme is
  # fetched through +morpheme_at+ the first time its index is read and is
  # cached afterwards.
  class MorphemeList
    include Enumerable

    # Cost attached to this list; nil when none was given and the source does
    # not expose one.
    attr_accessor :internal_cost

    # source_or_morphemes - a lazy source (see class comment) or a collection
    #                       of morphemes.
    # internal_cost       - explicit cost. When nil/false, the lazy source's
    #                       own +internal_cost+ is used if it responds to it.
    def initialize(source_or_morphemes, internal_cost: nil)
      @source = lazy_source?(source_or_morphemes) ? source_or_morphemes : nil
      @morphemes = @source ? Array.new(@source.size) : source_or_morphemes
      @internal_cost = internal_cost || source_internal_cost
    end

    # Yields every morpheme in order and returns self; without a block,
    # returns an Enumerator.
    def each(&block)
      return enum_for(:each) { size } unless block

      if @source
        size.times { |i| block.call(fetch(i)) }
      else
        @morphemes.each(&block)
      end
      self
    end

    # Element reference. A Range returns an Array slice; any other index is
    # converted with Integer(), may be negative (counted from the end), and
    # yields nil when out of bounds or when the list is empty.
    def [](index)
      return to_a[index] if index.is_a?(Range)

      idx = normalize_index(index)
      return nil if idx.nil?

      fetch(idx)
    end

    # Without an argument returns the first morpheme (or nil). With +n+
    # returns an Array of at most +n+ leading morphemes.
    # Raises ArgumentError when +n+ is negative.
    def first(n = nil)
      return self[0] unless n

      n = Integer(n)
      raise ArgumentError, "negative array size" if n.negative?

      limit = [n, size].min
      (0...limit).map { fetch(_1) }
    end

    # Without an argument returns the last morpheme (or nil). With +n+
    # returns an Array of at most +n+ trailing morphemes.
    # Raises ArgumentError when +n+ is negative.
    def last(n = nil)
      return self[-1] unless n

      n = Integer(n)
      raise ArgumentError, "negative array size" if n.negative?

      start = [size - n, 0].max
      (start...size).map { fetch(_1) }
    end

    def size
      @morphemes.size
    end

    def empty?
      @morphemes.empty?
    end

    # Surface strings of all morphemes. A lazy source answers this directly,
    # so no morpheme has to be materialized.
    def surfaces
      return @source.surfaces if @source&.respond_to?(:surfaces)

      map(&:surface)
    end

    def readings
      map(&:reading_form)
    end

    def dictionary_forms
      map(&:dictionary_form)
    end

    def normalized_forms
      map(&:normalized_form)
    end

    def total_costs
      map(&:total_cost)
    end

    def synonym_group_ids
      map(&:synonym_group_ids)
    end

    # Joins all surfaces back into the original text (no spaces, for Japanese text).
    def to_text
      surfaces.join
    end

    # Filter morphemes by POS. Accepts a PosMatcher or an array pattern.
    # Returns a new MorphemeList with only matching morphemes.
    #
    #   list.select_pos(Kabosu::PosMatcher.nouns)
    #   list.select_pos(["名詞", "固有名詞"])
    #
    def select_pos(matcher_or_pattern)
      matcher = coerce_to_matcher(matcher_or_pattern)
      self.class.new(matcher.filter(to_a))
    end

    # Inverse of select_pos. Returns a new MorphemeList excluding matching morphemes.
    def reject_pos(matcher_or_pattern)
      matcher = coerce_to_matcher(matcher_or_pattern)
      self.class.new(matcher.reject(to_a))
    end

    # Returns a fresh Array of all morphemes (materializing a lazy source).
    def to_a
      return @morphemes.dup unless @source

      (0...size).map { fetch(_1) }
    end

    def inspect
      cost = @internal_cost ? " cost=#{@internal_cost}" : ""
      "#<Kabosu::MorphemeList (#{size} morphemes)#{cost}: #{surfaces.join(" | ")}>"
    end

    private

    # Accepts a PosMatcher as-is or wraps an Array pattern in one.
    def coerce_to_matcher(matcher_or_pattern)
      case matcher_or_pattern
      when PosMatcher
        matcher_or_pattern
      when Array
        PosMatcher.new(matcher_or_pattern)
      else
        raise ArgumentError, "expected a PosMatcher or an Array pattern, got #{matcher_or_pattern.class}"
      end
    end

    def lazy_source?(obj)
      obj.respond_to?(:morpheme_at) && obj.respond_to?(:size) && obj.respond_to?(:surfaces)
    end

    # +internal_cost+ is not part of the lazy-source contract checked by
    # lazy_source?, so only ask the source for it when it is actually there.
    def source_internal_cost
      @source.internal_cost if @source.respond_to?(:internal_cost)
    end

    # Maps a possibly negative index to 0...size, or nil when out of range.
    def normalize_index(index)
      return nil if size.zero?

      idx = Integer(index)
      idx += size if idx.negative?
      return nil if idx.negative? || idx >= size

      idx
    end

    # Reads the morpheme at an in-range index, filling the cache on first use
    # when backed by a lazy source.
    def fetch(idx)
      return @morphemes[idx] unless @source

      @morphemes[idx] ||= @source.morpheme_at(idx)
    end
  end
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
module Kabosu
  # Immutable predicate over part-of-speech (POS) arrays.
  #
  # A matcher is built from one or more slot patterns or from a block, and
  # can be combined with other matchers through #|, #& and #-.
  class PosMatcher
    # Build a matcher from POS patterns or a block.
    #
    #   # From a block
    #   PosMatcher.new { |pos| pos[0] == "名詞" }
    #
    #   # Single pattern (array of strings; "*" or nil = match anything)
    #   PosMatcher.new(["名詞", "固有名詞", "*", "*"])
    #
    #   # Multiple patterns (matches if any pattern matches)
    #   PosMatcher.new(["名詞", "固有名詞"], ["動詞", "*", "*", "*"])
    #
    # Raises ArgumentError when both patterns and a block are given, when
    # neither is given, or when a pattern is not an Array.
    def initialize(*patterns, &block)
      if block
        raise ArgumentError, "cannot supply both patterns and a block" unless patterns.empty?

        @proc = block
      elsif patterns.empty?
        raise ArgumentError, "must supply at least one pattern or a block"
      else
        @patterns = patterns.map do |pattern|
          unless pattern.is_a?(Array)
            raise ArgumentError, "each pattern must be an Array of POS slots, got #{pattern.class}"
          end

          # Freeze a private copy so the caller's array stays mutable.
          pattern.dup.freeze
        end.freeze
      end

      freeze
    end

    # Returns true if the morpheme (or raw POS array) matches this matcher.
    def match?(morpheme_or_pos)
      pos = extract_pos(morpheme_or_pos)

      if @proc
        @proc.call(pos)
      else
        @patterns.any? { |pattern| pattern_match?(pattern, pos) }
      end
    end

    # Return matching morphemes as an Array.
    def filter(morphemes)
      morphemes.select { |m| match?(m) }
    end

    # Return non-matching morphemes as an Array.
    def reject(morphemes)
      morphemes.reject { |m| match?(m) }
    end

    # Union: matches if either matcher matches.
    def |(other)
      PosMatcher.new { |pos| match?(pos) || other.match?(pos) }
    end

    # Intersection: matches if both matchers match.
    def &(other)
      PosMatcher.new { |pos| match?(pos) && other.match?(pos) }
    end

    # Difference: matches self but not other.
    def -(other)
      difference(other)
    end

    def difference(other)
      PosMatcher.new { |pos| match?(pos) && !other.match?(pos) }
    end

    # ── Pre-built matchers ──
    # Each is created on first use and memoized on the class.

    def self.nouns
      @nouns ||= new(["名詞"])
    end

    def self.verbs
      @verbs ||= new(["動詞"])
    end

    def self.adjectives
      @adjectives ||= new(["形容詞"])
    end

    def self.particles
      @particles ||= new(["助詞"])
    end

    def self.auxiliary_verbs
      @auxiliary_verbs ||= new(["助動詞"])
    end

    def self.adverbs
      @adverbs ||= new(["副詞"])
    end

    def self.proper_nouns
      @proper_nouns ||= new(["名詞", "固有名詞"])
    end

    private

    # Accepts a raw POS Array or any object responding to #part_of_speech.
    def extract_pos(morpheme_or_pos)
      if morpheme_or_pos.is_a?(Array)
        morpheme_or_pos
      elsif morpheme_or_pos.respond_to?(:part_of_speech)
        morpheme_or_pos.part_of_speech
      else
        raise ArgumentError, "expected an Array or an object responding to #part_of_speech"
      end
    end

    # A pattern matches when every slot is a wildcard (nil or "*") or equals
    # the POS element at the same position; POS elements beyond the pattern's
    # length are not inspected.
    def pattern_match?(pattern, pos)
      pattern.each_with_index.all? do |slot, i|
        slot.nil? || slot == "*" || slot == pos[i]
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
namespace :release do
  # Rewrites the version string in lib/kabosu/version.rb and
  # ext/kabosu/Cargo.toml (both located relative to this file), then prints
  # the follow-up jj commands. Aborts without touching any file when the
  # argument is not MAJOR.MINOR.PATCH or when either file has no version line
  # to replace.
  desc "Bump version. Usage: rake release:bump[1.2.3]"
  task :bump, [:version] do |_t, args|
    version = args[:version]
    abort "Usage: rake release:bump[1.2.3]" unless version&.match?(/\A\d+\.\d+\.\d+\z/)

    targets = [
      # Update lib/kabosu/version.rb
      [File.expand_path("version.rb", __dir__), /VERSION = ".*"/, %(VERSION = "#{version}")],
      # Update ext/kabosu/Cargo.toml (only the first line starting with `version = `)
      [File.expand_path("../../ext/kabosu/Cargo.toml", __dir__), /^version = ".*"/, %(version = "#{version}")]
    ]

    # Compute every replacement first so a missing version line in one file
    # cannot leave the other file already bumped.
    updates = targets.map do |path, pattern, replacement|
      content = File.read(path)
      abort "No version line matching #{pattern.inspect} found in #{path}" unless content.match?(pattern)

      [path, content.sub(pattern, replacement)]
    end

    updates.each do |path, new_content|
      File.write(path, new_content)
      puts "Updated #{path}"
    end

    puts "\nVersion bumped to #{version}"
    puts "Next steps:"
    puts " jj commit -m 'Bump version to #{version}'"
    puts " jj tag set v#{version}"
    puts " jj git push --all"
  end
end
|
|
28
|
+
|
|
29
|
+
# Runs `gem build` against kabosu.gemspec in the current working directory.
desc "Build the gem"
task(:build) { sh "gem build kabosu.gemspec" }
|
|
33
|
+
|
|
34
|
+
# After :build has run, pushes the most recently modified kabosu-*.gem found
# in the current working directory.
desc "Build and push the gem to RubyGems"
task release: :build do
  gemfile = Dir["kabosu-*.gem"].max_by { |f| File.mtime(f) }
  abort "No .gem file found" unless gemfile

  # Multi-argument form: the file name is passed as a single argv entry and is
  # never interpreted by a shell.
  sh "gem", "push", gemfile
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
require "kabosu/dict_manager"
|
|
2
|
+
|
|
3
|
+
namespace :kabosu do
  editions = %w[small core full]

  # ── Install ──

  desc "Install the core dictionary (default). VERSION=YYYYMMDD to pin a specific release."
  task :install do
    Kabosu::DictManager.new.install("core", version: ENV["VERSION"])
  end

  # One kabosu:install:<edition> task per edition.
  namespace :install do
    editions.each do |edition|
      desc "Install the #{edition} dictionary. VERSION=YYYYMMDD to pin a specific release."
      task(edition.to_sym) do
        Kabosu::DictManager.new.install(edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Remove ──

  desc "Remove all installed dictionaries, or a specific one with EDITION=small|core|full and/or VERSION=YYYYMMDD"
  task :remove do
    Kabosu::DictManager.new.remove(edition: ENV["EDITION"], version: ENV["VERSION"])
  end

  # One kabosu:remove:<edition> task per edition.
  namespace :remove do
    editions.each do |edition|
      desc "Remove the #{edition} dictionary."
      task(edition.to_sym) do
        Kabosu::DictManager.new.remove(edition: edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Info ──

  desc "List installed Sudachi dictionaries"
  task :list do
    manager = Kabosu::DictManager.new
    entries = manager.installed

    if entries.empty?
      puts "No dictionaries installed. Run: rake kabosu:install"
      next
    end

    puts "Installed dictionaries (#{manager.dir}):"
    puts
    entries.each do |entry|
      path = entry[:path]
      # File size in MiB, rounded to one decimal place.
      size_mb = (File.size(path).to_f / 1024 / 1024).round(1)
      puts " #{entry[:version]} / #{entry[:edition].ljust(5)} (#{size_mb} MB)"
      puts " #{path}"
    end
  end

  desc "Show available dictionary versions from GitHub"
  task :versions do
    Kabosu::DictManager.new.available_versions.each { |version| puts version }
  end

  desc "Show the path to the best available dictionary. EDITION=small|core|full to be specific."
  task :path do
    puts Kabosu::DictManager.new.find(edition: ENV["EDITION"])
  rescue Kabosu::DictManager::DictNotFound => e
    # Report the lookup failure on stderr and exit with a failure status.
    $stderr.puts e.message
    exit 1
  end
end
|
data/lib/kabosu.rb
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
require_relative "kabosu/version"
|
|
2
|
+
require_relative "kabosu/kabosu"
|
|
3
|
+
require_relative "kabosu/dict_manager"
|
|
4
|
+
require_relative "kabosu/pos_matcher"
|
|
5
|
+
require_relative "kabosu/morpheme_list"
|
|
6
|
+
|
|
7
|
+
module Kabosu
|
|
8
|
+
# Base class for the errors defined below.
class Error < StandardError
end

# Raised by Dictionary.new for failures attributed to configuration.
class ConfigError < Error
end

# Raised by Dictionary.new for other initialization failures.
class DictionaryError < Error
end

# Raised by Dictionary#lookup.
class LookupError < Error
end

# Raised by Tokenizer#tokenize, Morpheme#split and Kabosu.tokenize.
class TokenizationError < Error
end

# Raised by Kabosu.split_sentences.
class SentenceSplitError < Error
end
|
|
14
|
+
|
|
15
|
+
# Plain value holder for one sentence returned by split_sentences when called
# with ranges: true. Holds a start position, an end position and the sentence
# text exactly as passed in.
class SentenceRange
  attr_reader :start, :end, :text

  # finish is exposed through the #end reader.
  def initialize(start, finish, text)
    @start, @end, @text = start, finish, text
  end

  # Returns the three fields as [start, end, text].
  def to_a
    [@start, @end, @text]
  end
end
|
|
28
|
+
|
|
29
|
+
# Drop any MODE_* constants already defined directly on this module before
# redefining them as symbols, so the assignments below do not emit
# "already initialized constant" warnings.
%i[MODE_A MODE_B MODE_C].each do |mode_const|
  remove_const(mode_const) if const_defined?(mode_const, false)
end

MODE_A = :a
MODE_B = :b
MODE_C = :c

# Hide TokenBatch from outside the module when it is defined here.
private_constant :TokenBatch if const_defined?(:TokenBatch, false)
|
|
36
|
+
|
|
37
|
+
# Translates a mode symbol (:a, :b or :c) into the upper-case string "A", "B"
# or "C". Any other value raises ArgumentError; param_name is the parameter
# name reported in that message.
def self.normalize_mode(mode, param_name: "mode")
  { a: "A", b: "B", c: "C" }.fetch(mode) do
    raise ArgumentError, "invalid #{param_name} #{mode.inspect}; expected :a, :b, or :c"
  end
end
|
|
49
|
+
private_class_method :normalize_mode
|
|
50
|
+
|
|
51
|
+
# ── Dictionary: keyword API and management ──
|
|
52
|
+
|
|
53
|
+
# Keyword-argument front end over the existing Dictionary methods: the
# original new/create/lookup stay reachable as _new/_create/_lookup and the
# public versions validate their arguments before delegating.
class Dictionary
  class << self
    alias_method :_new, :new

    # Validates the keyword arguments, then calls the original constructor
    # positionally as (config, system_dict, user_dicts).
    #
    # Raises ArgumentError for wrongly typed arguments or when both config
    # and system_dict are nil. A RuntimeError from the original constructor
    # is re-raised as ConfigError or DictionaryError.
    def new(config: nil, system_dict: nil, user_dicts: nil)
      raise ArgumentError, "config must be a String or nil" unless config.nil? || config.is_a?(String)
      raise ArgumentError, "system_dict must be a String or nil" unless system_dict.nil? || system_dict.is_a?(String)
      raise ArgumentError, "user_dicts must be an Array<String> or nil" unless user_dicts.nil? || user_dicts.is_a?(Array)
      unless user_dicts.nil? || user_dicts.all? { |entry| entry.is_a?(String) }
        raise ArgumentError, "user_dicts must contain only String values"
      end
      raise ArgumentError, "either config or system_dict is required" if config.nil? && system_dict.nil?

      _new(config, system_dict, user_dicts)
    rescue RuntimeError => e
      raise map_dictionary_init_error(e, config: config, system_dict: system_dict)
    end

    # Delegates to the shared DictManager's install.
    def install(edition = "core", version: nil)
      dict_manager.install(edition, version: version)
    end

    # Delegates to the shared DictManager's find.
    def path(edition: nil)
      dict_manager.find(edition: edition)
    end

    # Delegates to the shared DictManager's installed.
    def list
      dict_manager.installed
    end

    private

    # Builds the replacement exception for a constructor failure: ConfigError
    # when only a config was supplied or when the message mentions
    # config/json, DictionaryError otherwise.
    def map_dictionary_init_error(error, config:, system_dict:)
      message = error.message
      config_related = (config && system_dict.nil?) || message.match?(/config|setting\.json|json/i)
      (config_related ? ConfigError : DictionaryError).new(message)
    end

    # DictManager instance created on first use and reused afterwards.
    def dict_manager
      @dict_manager ||= DictManager.new
    end
  end

  alias_method :_create, :create
  alias_method :_lookup, :lookup

  # Accepts only the keywords mode (default MODE_C), fields (default nil),
  # debug (default false) and projection (default nil), validates them and
  # calls the original create with (mode string, fields, debug).
  #
  # Raises ArgumentError for unknown keywords or invalid values, and
  # NotImplementedError when projection is non-nil.
  def create(**options)
    unknown = options.keys - %i[mode fields debug projection]
    raise ArgumentError, "unknown keyword(s): #{unknown.join(", ")}" unless unknown.empty?

    defaults = { mode: MODE_C, fields: nil, debug: false, projection: nil }
    mode, fields, debug, projection = defaults.merge(options).values_at(:mode, :fields, :debug, :projection)

    raise ArgumentError, "fields must be an Array<String|Symbol> or nil" unless fields.nil? || fields.is_a?(Array)
    unless fields.nil? || fields.all? { |field| field.is_a?(String) || field.is_a?(Symbol) }
      raise ArgumentError, "fields must contain only String or Symbol values"
    end
    raise ArgumentError, "debug must be true or false" unless debug == true || debug == false
    raise NotImplementedError, "projection is not supported yet" unless projection.nil?

    _create(Kabosu.__send__(:normalize_mode, mode), fields, debug)
  end

  # Calls the original lookup and wraps its result in a MorphemeList.
  # Raises ArgumentError unless text is a String; a RuntimeError from the
  # original lookup is re-raised as LookupError with the original as cause.
  def lookup(text)
    raise ArgumentError, "text must be a String" unless text.is_a?(String)

    MorphemeList.new(_lookup(text))
  rescue RuntimeError => e
    raise LookupError.new(e.message), cause: e
  end
end
|
|
148
|
+
|
|
149
|
+
# ── Tokenizer: wrap output in MorphemeList ──
|
|
150
|
+
|
|
151
|
+
# Wraps the existing tokenize (kept reachable as _tokenize) so that callers
# receive a MorphemeList.
class Tokenizer
  alias_method :_tokenize, :tokenize

  # Tokenizes text and returns a MorphemeList built from the raw result,
  # carrying the result's internal_cost when it exposes one.
  #
  # Raises ArgumentError unless text is a String; a RuntimeError from the
  # original tokenize is re-raised as TokenizationError with the original as
  # cause.
  def tokenize(text)
    raise ArgumentError, "text must be a String" unless text.is_a?(String)

    raw = _tokenize(text)
    MorphemeList.new(raw, internal_cost: (raw.internal_cost if raw.respond_to?(:internal_cost)))
  rescue RuntimeError => e
    raise TokenizationError.new(e.message), cause: e
  end
end
|
|
166
|
+
|
|
167
|
+
# Wraps the existing split (kept reachable as _split) behind keyword
# arguments and returns a MorphemeList.
class Morpheme
  alias_method :_split, :split

  # mode       - :a, :b or :c (default MODE_C).
  # add_single - true or false (default true); passed through to the
  #              original split unchanged.
  #
  # Raises ArgumentError for an invalid add_single or mode; a RuntimeError
  # from the original split is re-raised as TokenizationError with the
  # original as cause.
  def split(mode: MODE_C, add_single: true)
    raise ArgumentError, "add_single must be true or false" unless add_single == true || add_single == false

    pieces = _split(Kabosu.__send__(:normalize_mode, mode), nil, add_single)
    MorphemeList.new(pieces)
  rescue RuntimeError => e
    raise TokenizationError.new(e.message), cause: e
  end
end
|
|
180
|
+
|
|
181
|
+
# Splits text into sentences via _split_sentences, or via
# _split_sentences_with_ranges when ranges is true.
#
# text         - String to split.
# limit        - nil or an Integer >= 1; passed through unchanged.
# with_checker - when true, a dictionary path is passed along: the given
#                dictionary, or Dictionary.path when none is given. When
#                false, nil is passed and dictionary is ignored.
# ranges       - when true, each [start, finish, sentence] triple of the
#                result is wrapped in a SentenceRange.
# dictionary   - nil or a String path.
#
# Raises ArgumentError for invalid arguments; a RuntimeError raised while
# splitting is re-raised as SentenceSplitError with the original as cause.
def self.split_sentences(text, limit: nil, with_checker: false, ranges: false, dictionary: nil)
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
  raise ArgumentError, "limit must be an Integer or nil" unless limit.nil? || limit.is_a?(Integer)
  raise ArgumentError, "limit must be greater than 0" if limit && limit < 1
  raise ArgumentError, "with_checker must be true or false" unless with_checker == true || with_checker == false
  raise ArgumentError, "ranges must be true or false" unless ranges == true || ranges == false
  raise ArgumentError, "dictionary must be a String path or nil" unless dictionary.nil? || dictionary.is_a?(String)

  dict_path = with_checker ? (dictionary || Dictionary.path) : nil
  return _split_sentences(text, limit, dict_path) unless ranges

  _split_sentences_with_ranges(text, limit, dict_path).map do |start, finish, sentence|
    SentenceRange.new(start, finish, sentence)
  end
rescue RuntimeError => e
  raise SentenceSplitError.new(e.message), cause: e
end
|
|
216
|
+
|
|
217
|
+
# ── Convenience tokenization ──
|
|
218
|
+
|
|
219
|
+
# Tokenize text using an explicitly provided tokenizer.
#
#   dict = Kabosu::Dictionary.new(system_dict: Kabosu::Dictionary.path)
#   tok = dict.create(mode: :a)
#   Kabosu.tokenize("東京都に住んでいる", tokenizer: tok)
#
# Returns a MorphemeList built from the tokenizer's raw _tokenize result,
# carrying the result's internal_cost when it exposes one.
#
# Raises ArgumentError unless text is a String and tokenizer is a Tokenizer;
# a RuntimeError raised while tokenizing is re-raised as TokenizationError
# with the original as cause.
def self.tokenize(text, tokenizer:)
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
  raise ArgumentError, "tokenizer must be a Kabosu::Tokenizer" unless tokenizer.is_a?(Tokenizer)

  raw = tokenizer.__send__(:_tokenize, text)
  MorphemeList.new(raw, internal_cost: (raw.internal_cost if raw.respond_to?(:internal_cost)))
rescue RuntimeError => e
  raise TokenizationError.new(e.message), cause: e
end
|
|
239
|
+
end
|