kabosu 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +94 -0
- data/ext/kabosu/Cargo.toml +12 -0
- data/ext/kabosu/extconf.rb +4 -0
- data/ext/kabosu/src/lib.rs +490 -0
- data/lib/kabosu/dict_manager.rb +239 -0
- data/lib/kabosu/morpheme_list.rb +97 -0
- data/lib/kabosu/pos_matcher.rb +119 -0
- data/lib/kabosu/tasks.rake +86 -0
- data/lib/kabosu/version.rb +3 -0
- data/lib/kabosu.rb +143 -0
- metadata +106 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "uri"
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "json"
|
|
5
|
+
require "open3"
|
|
6
|
+
|
|
7
|
+
module Kabosu
  # Downloads, locates and removes Sudachi dictionary files on local disk.
  #
  # Dictionaries are stored as
  #   <dir>/sudachi-dictionary-<version>/system_<edition>.dic
  # where <dir> defaults to ~/.kabosu/dict (see .default_dir).
  class DictManager
    EDITIONS = %w[small core full].freeze
    # Preference order used by #find when no edition is requested.
    EDITION_PRIORITY = %w[full core small].freeze
    GITHUB_REPO = "WorksApplications/SudachiDict"
    GITHUB_API = "https://api.github.com"

    class DictNotFound < StandardError; end
    class DownloadError < StandardError; end

    attr_reader :dir

    # Default storage directory: ~/.kabosu/dict/
    def self.default_dir
      File.join(Dir.home, ".kabosu", "dict")
    end

    # dir - directory that holds the "sudachi-dictionary-<version>" folders.
    def initialize(dir: self.class.default_dir)
      @dir = dir
    end

    # ── Install ──

    # Download and extract a dictionary edition.
    #
    #   manager.install("small")
    #   manager.install("core", version: "20260116")
    #
    # edition - "small", "core" or "full" (case-insensitive, symbols accepted).
    # version - release version without the "v" prefix; when nil the latest
    #           GitHub release is looked up.
    #
    # Returns the path of the installed .dic file. If that file already exists
    # nothing is downloaded.
    # Raises ArgumentError for an unknown edition and DownloadError when the
    # download or extraction fails.
    def install(edition = "core", version: nil)
      edition = validate_edition(edition)
      version ||= latest_version

      dest_dir = File.join(@dir, "sudachi-dictionary-#{version}")
      dic_path = File.join(dest_dir, "system_#{edition}.dic")

      if File.exist?(dic_path)
        $stderr.puts "Already installed: #{dic_path}"
        return dic_path
      end

      url = release_asset_url(version, edition)
      zip_path = File.join(@dir, "sudachi-dictionary-#{version}-#{edition}.zip")

      FileUtils.mkdir_p(@dir)
      begin
        download(url, zip_path)
        extract(zip_path, @dir)
      ensure
        # Remove the archive on success *and* on failure so that a partial
        # download or a failed extraction never leaves a stray zip behind.
        FileUtils.rm_f(zip_path)
      end

      unless File.exist?(dic_path)
        raise DownloadError, "Expected #{dic_path} after extraction, but file not found"
      end

      $stderr.puts "Installed: #{dic_path}"
      dic_path
    end

    # ── Discovery ──

    # List all installed dictionaries.
    # Returns an array of hashes: { version:, edition:, path: }
    # ordered by version directory name descending, then in EDITIONS order.
    def installed
      results = []
      return results unless Dir.exist?(@dir)

      Dir.glob(File.join(@dir, "sudachi-dictionary-*")).sort.reverse_each do |version_dir|
        next unless File.directory?(version_dir)

        version = File.basename(version_dir).sub("sudachi-dictionary-", "")
        EDITIONS.each do |edition|
          dic = File.join(version_dir, "system_#{edition}.dic")
          results << { version: version, edition: edition, path: dic } if File.exist?(dic)
        end
      end

      results
    end

    # Find the best available dictionary path.
    # Prefers: latest version, then full > core > small.
    #
    # edition - when given, the newest installed dictionary of exactly that
    #           edition is returned instead.
    #
    # Raises DictNotFound when nothing suitable is installed and ArgumentError
    # for an unknown edition.
    def find(edition: nil)
      candidates = installed
      raise DictNotFound, "No dictionaries installed. Run: rake kabosu:install" if candidates.empty?

      if edition
        edition = validate_edition(edition)
        match = candidates.find { |d| d[:edition] == edition }
        raise DictNotFound, "No #{edition} dictionary installed" unless match
        return match[:path]
      end

      # #installed is sorted newest-first, so the first entry names the newest version.
      newest_version = candidates.first[:version]
      newest = candidates.select { |d| d[:version] == newest_version }
      newest.min_by { |d| EDITION_PRIORITY.index(d[:edition]) }[:path]
    end

    # ── Remove ──

    # Remove a specific dictionary edition, or an entire version.
    # With neither argument every installed dictionary is removed.
    #
    # edition - optional edition filter; normalised like #install and #find
    #           (symbols and mixed case are accepted).
    # version - optional version filter.
    #
    # Raises DictNotFound when no installed dictionary matches.
    def remove(edition: nil, version: nil)
      edition = edition.to_s.downcase if edition
      version = version.to_s if version

      targets = installed
      targets = targets.select { |d| d[:version] == version } if version
      targets = targets.select { |d| d[:edition] == edition } if edition

      raise DictNotFound, "No matching dictionary found" if targets.empty?

      targets.each do |d|
        FileUtils.rm_f(d[:path])
        $stderr.puts "Removed: #{d[:path]}"

        # Drop the version directory once it no longer holds any dictionary.
        version_dir = File.dirname(d[:path])
        next unless Dir.glob(File.join(version_dir, "system_*.dic")).empty?

        FileUtils.rm_rf(version_dir)
        $stderr.puts "Removed empty directory: #{version_dir}"
      end
    end

    # ── Version resolution ──

    # Fetch the latest release tag from GitHub.
    # Tags are like "v20260116" — the "v" prefix is stripped.
    #
    # Raises DownloadError when the API call fails or returns no tag.
    def latest_version
      release = fetch_json(URI("#{GITHUB_API}/repos/#{GITHUB_REPO}/releases/latest"))
      tag = release["tag_name"] if release.is_a?(Hash)
      raise DownloadError, "GitHub API response did not include a release tag" unless tag.is_a?(String)

      tag.sub(/\Av/, "")
    end

    # List available versions from GitHub releases ("v" prefix stripped).
    #
    # Raises DownloadError when the API call fails or does not return a list.
    def available_versions
      releases = fetch_json(URI("#{GITHUB_API}/repos/#{GITHUB_REPO}/releases"))
      unless releases.is_a?(Array)
        raise DownloadError, "GitHub API response was not a list of releases"
      end

      tags = releases.map { |r| r["tag_name"] if r.is_a?(Hash) }.compact
      tags.map { |tag| tag.to_s.sub(/\Av/, "") }
    end

    private

    # Normalise an edition to a lowercase string and check it against EDITIONS.
    def validate_edition(edition)
      edition = edition.to_s.downcase
      unless EDITIONS.include?(edition)
        raise ArgumentError, "Unknown edition '#{edition}'. Must be one of: #{EDITIONS.join(", ")}"
      end
      edition
    end

    def release_asset_url(version, edition)
      "https://github.com/#{GITHUB_REPO}/releases/download/v#{version}/sudachi-dictionary-#{version}-#{edition}.zip"
    end

    # GET a GitHub API URI and parse the body as JSON.
    # An error status is reported as DownloadError here, before callers try to
    # read release fields out of an error payload.
    def fetch_json(uri)
      response = http_get(uri, headers: { "Accept" => "application/json" })
      unless response.is_a?(Net::HTTPSuccess)
        raise DownloadError, "GitHub API request failed: #{response.code} #{response.message} (#{uri})"
      end

      JSON.parse(response.body)
    end

    # Stream url into the file dest, printing progress to $stderr.
    def download(url, dest)
      $stderr.puts "Downloading #{url}..."
      uri = resolve_redirects(URI(url))

      Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
        http.request(Net::HTTP::Get.new(uri)) do |response|
          unless response.is_a?(Net::HTTPSuccess)
            raise DownloadError, "Failed to download: #{response.code} #{response.message}"
          end

          total = response["Content-Length"]&.to_i
          written = 0

          File.open(dest, "wb") do |f|
            response.read_body do |chunk|
              f.write(chunk)
              written += chunk.bytesize
              if total && total > 0
                pct = (written * 100 / total).clamp(0, 100)
                $stderr.print "\r #{megabytes(written)} / #{megabytes(total)} MB (#{pct}%)"
              end
            end
          end

          $stderr.puts "\r #{megabytes(written)} MB downloaded"
        end
      end
    end

    # Byte count expressed in MiB, rounded to one decimal place.
    def megabytes(bytes)
      (bytes.to_f / 1024 / 1024).round(1)
    end

    # Follow redirects with HEAD requests and return the final URI.
    # Each connection is closed before the next hop is requested.
    def resolve_redirects(uri, limit: 5)
      raise DownloadError, "Too many redirects" if limit == 0

      response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
        http.request(Net::HTTP::Head.new(uri))
      end
      return uri unless response.is_a?(Net::HTTPRedirection)

      resolve_redirects(redirect_target(uri, response), limit: limit - 1)
    end

    # GET uri, following up to redirect_limit redirects. Returns the response.
    def http_get(uri, headers: {}, redirect_limit: 5)
      raise DownloadError, "Too many redirects" if redirect_limit == 0

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")

      request = Net::HTTP::Get.new(uri)
      headers.each { |k, v| request[k] = v }

      response = http.request(request)
      return response unless response.is_a?(Net::HTTPRedirection)

      http_get(redirect_target(uri, response), headers: headers, redirect_limit: redirect_limit - 1)
    end

    # Resolve the Location header of a redirect response against the URI that
    # produced it, so relative redirects work too.
    def redirect_target(base, response)
      location = response["location"]
      if location.nil? || location.empty?
        raise DownloadError, "Redirect from #{base} did not include a Location header"
      end

      URI.join(base.to_s, location)
    end

    # Unpack zip_path into dest_dir with the system `unzip` command.
    def extract(zip_path, dest_dir)
      $stderr.puts "Extracting..."
      _stdout, stderr, status = Open3.capture3("unzip", "-o", zip_path, "-d", dest_dir)
      unless status.success?
        raise DownloadError, "Failed to extract #{zip_path}: #{stderr}"
      end
    rescue Errno::ENOENT
      # Open3 raises ENOENT when the executable itself cannot be found.
      raise DownloadError, "Failed to extract #{zip_path}: the `unzip` command was not found"
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module Kabosu
  # Enumerable wrapper around the morphemes produced by one tokenization,
  # optionally carrying the internal cost reported for it.
  class MorphemeList
    include Enumerable

    attr_accessor :internal_cost

    # morphemes     - the collection to wrap (indexed and iterated as given).
    # internal_cost - optional cost value stored alongside the morphemes.
    def initialize(morphemes, internal_cost: nil)
      @internal_cost = internal_cost
      @morphemes = morphemes
    end

    # Iterates the wrapped morphemes; delegates straight to the collection.
    def each(&visitor)
      @morphemes.each(&visitor)
    end

    def [](index)
      @morphemes[index]
    end

    # First morpheme, or the first n morphemes when n is given.
    def first(n = nil)
      return @morphemes.first unless n

      @morphemes.first(n)
    end

    # Last morpheme, or the last n morphemes when n is given.
    def last(n = nil)
      return @morphemes.last unless n

      @morphemes.last(n)
    end

    def size
      @morphemes.size
    end

    # The accessors below each collect one attribute from every morpheme.

    def surfaces
      collect_attribute(:surface)
    end

    def readings
      collect_attribute(:reading_form)
    end

    def dictionary_forms
      collect_attribute(:dictionary_form)
    end

    def normalized_forms
      collect_attribute(:normalized_form)
    end

    def total_costs
      collect_attribute(:total_cost)
    end

    def synonym_group_ids
      collect_attribute(:synonym_group_ids)
    end

    # Concatenates all surfaces without separators (suits Japanese text).
    def to_text
      surfaces.join
    end

    # Keep only the morphemes whose POS matches. Accepts a PosMatcher or an
    # array pattern and returns a new MorphemeList.
    #
    #   list.select_pos(Kabosu::PosMatcher.nouns)
    #   list.select_pos(["名詞", "固有名詞"])
    #
    def select_pos(matcher_or_pattern)
      kept = coerce_to_matcher(matcher_or_pattern).filter(@morphemes)
      self.class.new(kept)
    end

    # Counterpart of select_pos: a new MorphemeList without the matching morphemes.
    def reject_pos(matcher_or_pattern)
      remaining = coerce_to_matcher(matcher_or_pattern).reject(@morphemes)
      self.class.new(remaining)
    end

    def inspect
      cost_part = @internal_cost ? " cost=#{@internal_cost}" : ""
      "#<Kabosu::MorphemeList (#{size} morphemes)#{cost_part}: #{surfaces.join(" | ")}>"
    end

    private

    # Maps every morpheme to the value of the given reader method.
    def collect_attribute(reader)
      map(&reader)
    end

    # Returns a PosMatcher as-is, wraps an Array pattern in one, rejects anything else.
    def coerce_to_matcher(matcher_or_pattern)
      return matcher_or_pattern if PosMatcher === matcher_or_pattern
      return PosMatcher.new(matcher_or_pattern) if Array === matcher_or_pattern

      raise ArgumentError, "expected a PosMatcher or an Array pattern, got #{matcher_or_pattern.class}"
    end
  end
end
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
module Kabosu
  # Immutable predicate over part-of-speech (POS) arrays.
  #
  # A matcher is built either from one or more positional patterns or from a
  # block, and can be combined with other matchers via |, & and -.
  class PosMatcher
    # Build a matcher from POS patterns or a block.
    #
    #   # From a block
    #   PosMatcher.new { |pos| pos[0] == "名詞" }
    #
    #   # Single pattern (array of strings; "*" or nil = match anything)
    #   PosMatcher.new(["名詞", "固有名詞", "*", "*"])
    #
    #   # Multiple patterns (matches if any pattern matches)
    #   PosMatcher.new(["名詞", "固有名詞"], ["動詞", "*", "*", "*"])
    #
    # Patterns are copied before being frozen, so the arrays passed in by the
    # caller stay mutable and later changes to them do not affect the matcher.
    #
    # Raises ArgumentError when both patterns and a block are given, when
    # neither is given, or when a pattern is not array-like (e.g. a bare String).
    def initialize(*patterns, &block)
      if block
        raise ArgumentError, "cannot supply both patterns and a block" unless patterns.empty?

        @proc = block
      elsif patterns.empty?
        raise ArgumentError, "must supply at least one pattern or a block"
      else
        patterns.each do |pattern|
          # pattern_match? walks the pattern with each_with_index; reject
          # anything that cannot support that now rather than at match time.
          next if pattern.respond_to?(:each_with_index)

          raise ArgumentError, "each pattern must be an Array of POS elements, got #{pattern.class}"
        end

        @patterns = patterns.map { |pattern| pattern.dup.freeze }.freeze
      end

      freeze
    end

    # Returns true if the morpheme (or raw POS array) matches this matcher.
    # For block-based matchers the block's own return value is returned.
    def match?(morpheme_or_pos)
      pos = extract_pos(morpheme_or_pos)

      if @proc
        @proc.call(pos)
      else
        @patterns.any? { |pattern| pattern_match?(pattern, pos) }
      end
    end

    # Return matching morphemes as an Array.
    def filter(morphemes)
      morphemes.select { |m| match?(m) }
    end

    # Return non-matching morphemes as an Array.
    def reject(morphemes)
      morphemes.reject { |m| match?(m) }
    end

    # Union: matches if either matcher matches.
    def |(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) || b.match?(pos) }
    end

    # Intersection: matches if both matchers match.
    def &(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) && b.match?(pos) }
    end

    # Difference: matches self but not other.
    def -(other)
      difference(other)
    end

    def difference(other)
      a, b = self, other
      PosMatcher.new { |pos| a.match?(pos) && !b.match?(pos) }
    end

    # ── Pre-built matchers ──
    # Each is created on first use and memoized on the class.

    def self.nouns
      @nouns ||= new(["名詞"])
    end

    def self.verbs
      @verbs ||= new(["動詞"])
    end

    def self.adjectives
      @adjectives ||= new(["形容詞"])
    end

    def self.particles
      @particles ||= new(["助詞"])
    end

    def self.auxiliary_verbs
      @auxiliary_verbs ||= new(["助動詞"])
    end

    def self.adverbs
      @adverbs ||= new(["副詞"])
    end

    def self.proper_nouns
      @proper_nouns ||= new(["名詞", "固有名詞"])
    end

    private

    # Accepts a raw POS Array or any object responding to #part_of_speech.
    def extract_pos(morpheme_or_pos)
      if morpheme_or_pos.is_a?(Array)
        morpheme_or_pos
      elsif morpheme_or_pos.respond_to?(:part_of_speech)
        morpheme_or_pos.part_of_speech
      else
        raise ArgumentError, "expected an Array or an object responding to #part_of_speech"
      end
    end

    # A pattern matches when every slot is a wildcard (nil or "*") or equals
    # the POS element at the same index. Slots beyond the pattern's length
    # are not compared, so a short pattern acts as a prefix match.
    def pattern_match?(pattern, pos)
      pattern.each_with_index.all? do |slot, i|
        slot.nil? || slot == "*" || slot == pos[i]
      end
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
require "kabosu/dict_manager"
|
|
2
|
+
|
|
3
|
+
namespace :kabosu do
  # Editions that get their own install:<edition> / remove:<edition> task.
  editions = %w[small core full]

  # ── Install ──

  desc "Install the core dictionary (default). VERSION=YYYYMMDD to pin a specific release."
  task :install do
    Kabosu::DictManager.new.install("core", version: ENV["VERSION"])
  end

  namespace :install do
    editions.each do |edition|
      desc "Install the #{edition} dictionary. VERSION=YYYYMMDD to pin a specific release."
      task edition do
        Kabosu::DictManager.new.install(edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Remove ──

  desc "Remove all installed dictionaries, or a specific one with EDITION=small|core|full and/or VERSION=YYYYMMDD"
  task :remove do
    Kabosu::DictManager.new.remove(edition: ENV["EDITION"], version: ENV["VERSION"])
  end

  namespace :remove do
    editions.each do |edition|
      desc "Remove the #{edition} dictionary."
      task edition do
        Kabosu::DictManager.new.remove(edition: edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Info ──

  desc "List installed Sudachi dictionaries"
  task :list do
    manager = Kabosu::DictManager.new
    entries = manager.installed

    if entries.empty?
      puts "No dictionaries installed. Run: rake kabosu:install"
      next
    end

    puts "Installed dictionaries (#{manager.dir}):"
    puts
    entries.each do |entry|
      # File size in MiB, one decimal place.
      size_mb = (File.size(entry[:path]).to_f / 1024 / 1024).round(1)
      puts " #{entry[:version]} / #{entry[:edition].ljust(5)} (#{size_mb} MB)"
      puts " #{entry[:path]}"
    end
  end

  desc "Show available dictionary versions from GitHub"
  task :versions do
    Kabosu::DictManager.new.available_versions.each do |version|
      puts version
    end
  end

  desc "Show the path to the best available dictionary. EDITION=small|core|full to be specific."
  task :path do
    # A missing dictionary is reported on stderr with exit status 1.
    resolved = begin
      Kabosu::DictManager.new.find(edition: ENV["EDITION"])
    rescue Kabosu::DictManager::DictNotFound => error
      $stderr.puts error.message
      exit 1
    end
    puts resolved
  end
end
|
data/lib/kabosu.rb
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
require_relative "kabosu/version"
|
|
2
|
+
require_relative "kabosu/kabosu"
|
|
3
|
+
require_relative "kabosu/dict_manager"
|
|
4
|
+
require_relative "kabosu/pos_matcher"
|
|
5
|
+
require_relative "kabosu/morpheme_list"
|
|
6
|
+
|
|
7
|
+
module Kabosu
  # ── Dictionary.new: keyword API + auto-discovery ──

  class Dictionary
    class << self
      # Keep the original constructor reachable under a private-looking name.
      alias_method :_new, :new

      # Keyword front-end for the original constructor. When dict is not
      # given, the path is looked up via Kabosu.dict_path.
      def new(config: nil, dict: nil)
        _new(config, dict || Kabosu.dict_path)
      end
    end
  end

  # ── Tokenizer: wrap output in MorphemeList ──

  class Tokenizer
    alias_method :_tokenize, :tokenize

    # Runs the original tokenize and wraps the result in a MorphemeList,
    # attaching internal_cost when this tokenizer exposes it.
    def tokenize(text)
      raw = _tokenize(text)
      MorphemeList.new(raw, internal_cost: (internal_cost if respond_to?(:internal_cost)))
    end

    # Only wrap tokenize_sentences when the class actually provides it.
    if method_defined?(:tokenize_sentences)
      alias_method :_tokenize_sentences, :tokenize_sentences

      def tokenize_sentences(text)
        _tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
      end
    end
  end

  # ── StatefulTokenizer: wrap output in MorphemeList ──

  class StatefulTokenizer
    alias_method :_tokenize, :tokenize

    # Same wrapping as Tokenizer#tokenize.
    def tokenize(text)
      raw = _tokenize(text)
      MorphemeList.new(raw, internal_cost: (internal_cost if respond_to?(:internal_cost)))
    end

    # Only wrap tokenize_sentences when the class actually provides it.
    if method_defined?(:tokenize_sentences)
      alias_method :_tokenize_sentences, :tokenize_sentences

      def tokenize_sentences(text)
        _tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
      end
    end
  end

  # ── Dictionary management ──

  # Shared DictManager instance, created on first use.
  def self.dict_manager
    @dict_manager ||= DictManager.new
  end

  def self.install_dictionary(edition = "core", version: nil)
    dict_manager.install(edition, version: version)
  end

  def self.dict_path(edition: nil)
    dict_manager.find(edition: edition)
  end

  def self.dictionaries
    dict_manager.installed
  end

  # ── Convenience tokenization ──

  # Tokenize text with a tokenizer cached per (edition, mode); the dictionary
  # is resolved through Kabosu.dict_path.
  #
  #   Kabosu.tokenize("東京都に住んでいる")
  #   Kabosu.tokenize("東京都に住んでいる", mode: "A")
  #   Kabosu.tokenize("東京都に住んでいる", edition: "small")
  #
  def self.tokenize(text, mode: "C", edition: nil)
    cached_tokenizer(edition: edition, mode: mode).tokenize(text)
  end

  # Tokenize text into sentences. Returns an array of MorphemeList objects,
  # one per sentence.
  #
  #   Kabosu.tokenize_sentences("東京都に住んでいる。大阪も好きだ。")
  #
  def self.tokenize_sentences(text, mode: "C", edition: nil)
    tokenizer = cached_tokenizer(edition: edition, mode: mode)

    # The wrapped method already returns MorphemeList objects.
    return tokenizer.tokenize_sentences(text) if tokenizer.respond_to?(:_tokenize_sentences)

    # Fallback if the alias wasn't set up (Rust method not yet available)
    if tokenizer.respond_to?(:tokenize_sentences)
      return tokenizer.tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
    end

    # Final fallback: treat entire text as a single sentence
    [tokenizer.tokenize(text)]
  end

  # Convenience factory for PosMatcher.
  #
  #   nouns = Kabosu.pos_matcher("名詞")
  #   verbs = Kabosu.pos_matcher(["動詞", "*", "*", "*"])
  #   custom = Kabosu.pos_matcher { |pos| pos[0] == "動詞" }
  #
  def self.pos_matcher(*patterns, &block)
    return PosMatcher.new(&block) if block

    # Shorthand: single string is treated as first POS element
    lone_string = patterns.length == 1 && patterns[0].is_a?(String)
    lone_string ? PosMatcher.new([patterns[0]]) : PosMatcher.new(*patterns)
  end

  # Create a StatefulTokenizer for efficient batch processing.
  # Reuses internal buffers across calls.
  #
  #   tok = Kabosu.batch_tokenizer(mode: "A")
  #   texts.each { |t| results << tok.tokenize(t) }
  #
  def self.batch_tokenizer(mode: "C", edition: nil)
    dictionary = Dictionary.new(dict: dict_path(edition: edition))
    dictionary.create_stateful(mode)
  end

  # @api private
  # Memoizes one tokenizer per [edition, mode] pair.
  def self.cached_tokenizer(edition:, mode:)
    @tokenizers ||= {}
    key = [edition, mode]
    @tokenizers[key] ||= Dictionary.new(dict: dict_path(edition: edition)).create(mode)
  end
  private_class_method :cached_tokenizer
end
|