kabosu 0.6.10.2 → 0.6.11.0.dev.20260627.a5a69e7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3 -3
- data/ext/kabosu/Cargo.toml +2 -2
- data/ext/kabosu/src/grouping.rs +18 -14
- data/ext/kabosu/src/lib.rs +5 -0
- data/ext/kabosu/src/morpheme.rs +9 -2
- data/lib/kabosu/dict_manager.rb +20 -17
- data/lib/kabosu/morpheme_list.rb +5 -5
- data/lib/kabosu/pos_matcher.rb +7 -4
- data/lib/kabosu/tasks.rake +15 -16
- data/lib/kabosu/version.rb +1 -1
- data/lib/kabosu.rb +28 -66
- metadata +32 -17
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '09c9e8c9a75657b66be036aa6b88ed8f3d9e540551d01612fb73c0f73e3fb45c'
|
|
4
|
+
data.tar.gz: 104e52c5170dd5c8d5a7f03e88b6eb4185647b1804019733093e09b976af9b75
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5bc855d6659efe5450e43bf70c6d24438fe20fe333f7bcd08c0ea6983221c6b522952e7e5a65150051c191b8a2791f1d2f7ceb468ab80faa58e6844ff804dda
|
|
7
|
+
data.tar.gz: 7e86601193b2207986fd0fc43f875d0ff8f252f6264a1968f282bca643061a0307c97dff63a95dfae4a7035770fb4da288ab40b00a38e9957d91bc99cf7dba0a
|
data/Cargo.lock
CHANGED
|
@@ -159,7 +159,7 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
|
|
159
159
|
|
|
160
160
|
[[package]]
|
|
161
161
|
name = "kabosu"
|
|
162
|
-
version = "0.6.
|
|
162
|
+
version = "0.6.11"
|
|
163
163
|
dependencies = [
|
|
164
164
|
"magnus",
|
|
165
165
|
"sudachi",
|
|
@@ -393,8 +393,8 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
|
|
393
393
|
|
|
394
394
|
[[package]]
|
|
395
395
|
name = "sudachi"
|
|
396
|
-
version = "0.6.
|
|
397
|
-
source = "git+https://github.com/WorksApplications/sudachi.rs?tag=v0.6.
|
|
396
|
+
version = "0.6.11"
|
|
397
|
+
source = "git+https://github.com/WorksApplications/sudachi.rs?tag=v0.6.11#90fd6068c80c2fc3b63e0dbab0e341475bad4d8f"
|
|
398
398
|
dependencies = [
|
|
399
399
|
"aho-corasick",
|
|
400
400
|
"bitflags",
|
data/ext/kabosu/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kabosu"
|
|
3
|
-
version = "0.6.
|
|
3
|
+
version = "0.6.11"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
publish = false
|
|
6
6
|
|
|
@@ -9,4 +9,4 @@ crate-type = ["cdylib"]
|
|
|
9
9
|
|
|
10
10
|
[dependencies]
|
|
11
11
|
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
12
|
-
sudachi = { git = "https://github.com/WorksApplications/sudachi.rs", tag = "v0.6.
|
|
12
|
+
sudachi = { git = "https://github.com/WorksApplications/sudachi.rs", tag = "v0.6.11" }
|
data/ext/kabosu/src/grouping.rs
CHANGED
|
@@ -39,17 +39,16 @@ pub(crate) fn group_morphemes_rust(
|
|
|
39
39
|
// POS helpers
|
|
40
40
|
|
|
41
41
|
fn is_content_word(pos_id: u16, dict: &JapaneseDictionary) -> bool {
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
!matches!(
|
|
43
|
+
dict.grammar()
|
|
44
|
+
.pos_components(pos_id)
|
|
45
|
+
.first()
|
|
46
|
+
.map(|s| s.as_str()),
|
|
47
|
+
Some("助詞") | Some("助動詞") | Some("補助記号") | Some("記号") | Some("空白")
|
|
48
|
+
)
|
|
46
49
|
}
|
|
47
50
|
|
|
48
|
-
fn extends_group(
|
|
49
|
-
m: &MorphemeData,
|
|
50
|
-
prev: &MorphemeData,
|
|
51
|
-
dict: &JapaneseDictionary,
|
|
52
|
-
) -> bool {
|
|
51
|
+
fn extends_group(m: &MorphemeData, prev: &MorphemeData, dict: &JapaneseDictionary) -> bool {
|
|
53
52
|
let comps = dict.grammar().pos_components(m.pos_id);
|
|
54
53
|
let pos0 = comps.first().map(|s| s.as_str());
|
|
55
54
|
let pos1 = comps.get(1).map(|s| s.as_str());
|
|
@@ -74,10 +73,12 @@ fn extends_group(
|
|
|
74
73
|
return false;
|
|
75
74
|
}
|
|
76
75
|
// te-form auxiliary chain: て/で + いる/ある/くる/etc.
|
|
77
|
-
let prev_pos0 = dict
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
76
|
+
let prev_pos0 = dict
|
|
77
|
+
.grammar()
|
|
78
|
+
.pos_components(prev.pos_id)
|
|
79
|
+
.first()
|
|
80
|
+
.map(|s| s.as_str());
|
|
81
|
+
if prev_pos0 == Some("助詞") && (prev.surface == "て" || prev.surface == "で") {
|
|
81
82
|
return true;
|
|
82
83
|
}
|
|
83
84
|
// compound verb (V+V) intentionally skipped — caller handles DB lookup
|
|
@@ -127,7 +128,10 @@ fn is_clause_boundary_particle(surface: &str) -> bool {
|
|
|
127
128
|
|
|
128
129
|
fn is_verb_adj_adv(pos_id: u16, dict: &JapaneseDictionary) -> bool {
|
|
129
130
|
matches!(
|
|
130
|
-
dict.grammar()
|
|
131
|
+
dict.grammar()
|
|
132
|
+
.pos_components(pos_id)
|
|
133
|
+
.first()
|
|
134
|
+
.map(|s| s.as_str()),
|
|
131
135
|
Some("動詞") | Some("形容詞") | Some("形状詞")
|
|
132
136
|
)
|
|
133
137
|
}
|
data/ext/kabosu/src/lib.rs
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
// sudachi::error::SudachiError is a large external enum; boxing it to satisfy
|
|
2
|
+
// clippy::result_large_err would churn Result signatures throughout the crate
|
|
3
|
+
// for no real benefit on these cold error paths.
|
|
4
|
+
#![allow(clippy::result_large_err)]
|
|
5
|
+
|
|
1
6
|
mod dictionary;
|
|
2
7
|
mod errors;
|
|
3
8
|
mod grouping;
|
data/ext/kabosu/src/morpheme.rs
CHANGED
|
@@ -254,7 +254,10 @@ impl RbMorpheme {
|
|
|
254
254
|
let pos_id = self.data.pos_id;
|
|
255
255
|
|
|
256
256
|
{
|
|
257
|
-
let cache = POS_CACHE
|
|
257
|
+
let cache = POS_CACHE
|
|
258
|
+
.get_or_init(|| Mutex::new(HashMap::new()))
|
|
259
|
+
.lock()
|
|
260
|
+
.unwrap();
|
|
258
261
|
if let Some(&cached) = cache.get(&(dict_ptr, pos_id)) {
|
|
259
262
|
return Ok(cached.0);
|
|
260
263
|
}
|
|
@@ -272,7 +275,11 @@ impl RbMorpheme {
|
|
|
272
275
|
// arrays are tiny. The cache lives for the process lifetime.
|
|
273
276
|
gc::register_mark_object(ary);
|
|
274
277
|
|
|
275
|
-
POS_CACHE
|
|
278
|
+
POS_CACHE
|
|
279
|
+
.get_or_init(|| Mutex::new(HashMap::new()))
|
|
280
|
+
.lock()
|
|
281
|
+
.unwrap()
|
|
282
|
+
.insert((dict_ptr, pos_id), CachedRArray(ary));
|
|
276
283
|
|
|
277
284
|
Ok(ary)
|
|
278
285
|
}
|
data/lib/kabosu/dict_manager.rb
CHANGED
|
@@ -8,8 +8,8 @@ module Kabosu
|
|
|
8
8
|
class DictManager
|
|
9
9
|
EDITIONS = %w[small core full].freeze
|
|
10
10
|
EDITION_PRIORITY = %w[full core small].freeze
|
|
11
|
-
GITHUB_REPO = "WorksApplications/SudachiDict"
|
|
12
|
-
GITHUB_API = "https://api.github.com"
|
|
11
|
+
GITHUB_REPO = "WorksApplications/SudachiDict".freeze
|
|
12
|
+
GITHUB_API = "https://api.github.com".freeze
|
|
13
13
|
|
|
14
14
|
class DictNotFound < StandardError; end
|
|
15
15
|
class DownloadError < StandardError; end
|
|
@@ -42,7 +42,7 @@ module Kabosu
|
|
|
42
42
|
dic_path = File.join(dest_dir, "system_#{edition}.dic")
|
|
43
43
|
|
|
44
44
|
if File.exist?(dic_path)
|
|
45
|
-
|
|
45
|
+
warn "Already installed: #{dic_path}"
|
|
46
46
|
return dic_path
|
|
47
47
|
end
|
|
48
48
|
|
|
@@ -54,11 +54,9 @@ module Kabosu
|
|
|
54
54
|
extract(zip_path, @dir)
|
|
55
55
|
FileUtils.rm_f(zip_path)
|
|
56
56
|
|
|
57
|
-
unless File.exist?(dic_path)
|
|
58
|
-
raise DownloadError, "Expected #{dic_path} after extraction, but file not found"
|
|
59
|
-
end
|
|
57
|
+
raise DownloadError, "Expected #{dic_path} after extraction, but file not found" unless File.exist?(dic_path)
|
|
60
58
|
|
|
61
|
-
|
|
59
|
+
warn "Installed: #{dic_path}"
|
|
62
60
|
dic_path
|
|
63
61
|
end
|
|
64
62
|
|
|
@@ -88,7 +86,7 @@ module Kabosu
|
|
|
88
86
|
results = []
|
|
89
87
|
return results unless Dir.exist?(@dir)
|
|
90
88
|
|
|
91
|
-
Dir.glob(File.join(@dir, "sudachi-dictionary-*")).
|
|
89
|
+
Dir.glob(File.join(@dir, "sudachi-dictionary-*")).reverse.each do |version_dir|
|
|
92
90
|
next unless File.directory?(version_dir)
|
|
93
91
|
|
|
94
92
|
version = File.basename(version_dir).sub("sudachi-dictionary-", "")
|
|
@@ -113,6 +111,7 @@ module Kabosu
|
|
|
113
111
|
edition = validate_edition(edition)
|
|
114
112
|
match = candidates.find { |d| d[:edition] == edition }
|
|
115
113
|
raise DictNotFound, "No #{edition} dictionary installed" unless match
|
|
114
|
+
|
|
116
115
|
return match[:path]
|
|
117
116
|
end
|
|
118
117
|
|
|
@@ -140,14 +139,14 @@ module Kabosu
|
|
|
140
139
|
|
|
141
140
|
targets.each do |d|
|
|
142
141
|
FileUtils.rm_f(d[:path])
|
|
143
|
-
|
|
142
|
+
warn "Removed: #{d[:path]}"
|
|
144
143
|
|
|
145
144
|
# Clean up empty version directories
|
|
146
145
|
version_dir = File.dirname(d[:path])
|
|
147
146
|
dics_remaining = Dir.glob(File.join(version_dir, "system_*.dic"))
|
|
148
147
|
if dics_remaining.empty?
|
|
149
148
|
FileUtils.rm_rf(version_dir)
|
|
150
|
-
|
|
149
|
+
warn "Removed empty directory: #{version_dir}"
|
|
151
150
|
end
|
|
152
151
|
end
|
|
153
152
|
end
|
|
@@ -178,6 +177,7 @@ module Kabosu
|
|
|
178
177
|
unless EDITIONS.include?(edition)
|
|
179
178
|
raise ArgumentError, "Unknown edition '#{edition}'. Must be one of: #{EDITIONS.join(", ")}"
|
|
180
179
|
end
|
|
180
|
+
|
|
181
181
|
edition
|
|
182
182
|
end
|
|
183
183
|
|
|
@@ -186,7 +186,7 @@ module Kabosu
|
|
|
186
186
|
end
|
|
187
187
|
|
|
188
188
|
def download(url, dest)
|
|
189
|
-
|
|
189
|
+
warn "Downloading #{url}..."
|
|
190
190
|
uri = resolve_redirects(URI(url))
|
|
191
191
|
|
|
192
192
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
|
@@ -202,20 +202,22 @@ module Kabosu
|
|
|
202
202
|
response.read_body do |chunk|
|
|
203
203
|
f.write(chunk)
|
|
204
204
|
written += chunk.bytesize
|
|
205
|
-
if total
|
|
205
|
+
if total&.positive?
|
|
206
206
|
pct = (written * 100 / total).clamp(0, 100)
|
|
207
|
-
|
|
207
|
+
done_mb = (written.to_f / 1024 / 1024).round(1)
|
|
208
|
+
total_mb = (total.to_f / 1024 / 1024).round(1)
|
|
209
|
+
$stderr.print "\r #{done_mb} / #{total_mb} MB (#{pct}%)"
|
|
208
210
|
end
|
|
209
211
|
end
|
|
210
212
|
end
|
|
211
213
|
|
|
212
|
-
|
|
214
|
+
warn "\r #{(written.to_f / 1024 / 1024).round(1)} MB downloaded"
|
|
213
215
|
end
|
|
214
216
|
end
|
|
215
217
|
end
|
|
216
218
|
|
|
217
219
|
def resolve_redirects(uri, limit: 5)
|
|
218
|
-
raise DownloadError, "Too many redirects" if limit
|
|
220
|
+
raise DownloadError, "Too many redirects" if limit.zero?
|
|
219
221
|
|
|
220
222
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
|
221
223
|
response = http.request(Net::HTTP::Head.new(uri))
|
|
@@ -229,7 +231,7 @@ module Kabosu
|
|
|
229
231
|
end
|
|
230
232
|
|
|
231
233
|
def http_get(uri, headers: {}, redirect_limit: 5)
|
|
232
|
-
raise DownloadError, "Too many redirects" if redirect_limit
|
|
234
|
+
raise DownloadError, "Too many redirects" if redirect_limit.zero?
|
|
233
235
|
|
|
234
236
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
235
237
|
http.use_ssl = (uri.scheme == "https")
|
|
@@ -248,7 +250,7 @@ module Kabosu
|
|
|
248
250
|
end
|
|
249
251
|
|
|
250
252
|
def extract(zip_path, dest_dir)
|
|
251
|
-
|
|
253
|
+
warn "Extracting..."
|
|
252
254
|
Zip::File.open(zip_path) do |archive|
|
|
253
255
|
archive.each do |entry|
|
|
254
256
|
target = File.join(dest_dir, entry.name)
|
|
@@ -256,6 +258,7 @@ module Kabosu
|
|
|
256
258
|
unless File.expand_path(target).start_with?(File.expand_path(dest_dir) + File::SEPARATOR)
|
|
257
259
|
raise DownloadError, "Refusing to extract entry outside dest_dir: #{entry.name}"
|
|
258
260
|
end
|
|
261
|
+
|
|
259
262
|
FileUtils.mkdir_p(File.dirname(target))
|
|
260
263
|
entry.extract(target) { true } # overwrite existing
|
|
261
264
|
end
|
data/lib/kabosu/morpheme_list.rb
CHANGED
|
@@ -7,7 +7,7 @@ module Kabosu
|
|
|
7
7
|
def initialize(source_or_morphemes, internal_cost: nil)
|
|
8
8
|
@source = source_or_morphemes if lazy_source?(source_or_morphemes)
|
|
9
9
|
@morphemes = @source ? Array.new(@source.size) : source_or_morphemes
|
|
10
|
-
@internal_cost = internal_cost ||
|
|
10
|
+
@internal_cost = internal_cost || @source&.internal_cost
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
def each(&block)
|
|
@@ -62,7 +62,7 @@ module Kabosu
|
|
|
62
62
|
end
|
|
63
63
|
|
|
64
64
|
def surfaces
|
|
65
|
-
return @source.surfaces if @source
|
|
65
|
+
return @source.surfaces if @source.respond_to?(:surfaces)
|
|
66
66
|
|
|
67
67
|
map(&:surface)
|
|
68
68
|
end
|
|
@@ -96,9 +96,7 @@ module Kabosu
|
|
|
96
96
|
# source. Falls back to a Ruby implementation for already-materialized
|
|
97
97
|
# lists so the method is always safe to call.
|
|
98
98
|
def group_morphemes
|
|
99
|
-
if @source
|
|
100
|
-
return @source.group_morphemes
|
|
101
|
-
end
|
|
99
|
+
return @source.group_morphemes if @source.respond_to?(:group_morphemes)
|
|
102
100
|
|
|
103
101
|
groups = []
|
|
104
102
|
each do |m|
|
|
@@ -131,10 +129,12 @@ module Kabosu
|
|
|
131
129
|
|
|
132
130
|
def clause_boundary?(morpheme)
|
|
133
131
|
return false unless morpheme
|
|
132
|
+
|
|
134
133
|
pos = morpheme.part_of_speech
|
|
135
134
|
return true if pos[0] == "助詞" &&
|
|
136
135
|
%w[ながら たら ば と のに から ので けれど けど つつ なり や か かどうか とも].include?(morpheme.surface)
|
|
137
136
|
return true if pos[0] == "助詞" && pos[1] == "接続助詞" && morpheme.surface == "が"
|
|
137
|
+
|
|
138
138
|
false
|
|
139
139
|
end
|
|
140
140
|
|
data/lib/kabosu/pos_matcher.rb
CHANGED
|
@@ -48,13 +48,15 @@ module Kabosu
|
|
|
48
48
|
|
|
49
49
|
# Union: matches if either matcher matches.
|
|
50
50
|
def |(other)
|
|
51
|
-
a
|
|
51
|
+
a = self
|
|
52
|
+
b = other
|
|
52
53
|
PosMatcher.new { |pos| a.match?(pos) || b.match?(pos) }
|
|
53
54
|
end
|
|
54
55
|
|
|
55
56
|
# Intersection: matches if both matchers match.
|
|
56
57
|
def &(other)
|
|
57
|
-
a
|
|
58
|
+
a = self
|
|
59
|
+
b = other
|
|
58
60
|
PosMatcher.new { |pos| a.match?(pos) && b.match?(pos) }
|
|
59
61
|
end
|
|
60
62
|
|
|
@@ -64,7 +66,8 @@ module Kabosu
|
|
|
64
66
|
end
|
|
65
67
|
|
|
66
68
|
def difference(other)
|
|
67
|
-
a
|
|
69
|
+
a = self
|
|
70
|
+
b = other
|
|
68
71
|
PosMatcher.new { |pos| a.match?(pos) && !b.match?(pos) }
|
|
69
72
|
end
|
|
70
73
|
|
|
@@ -95,7 +98,7 @@ module Kabosu
|
|
|
95
98
|
end
|
|
96
99
|
|
|
97
100
|
def self.proper_nouns
|
|
98
|
-
@proper_nouns ||= new([
|
|
101
|
+
@proper_nouns ||= new(%w[名詞 固有名詞])
|
|
99
102
|
end
|
|
100
103
|
|
|
101
104
|
private
|
data/lib/kabosu/tasks.rake
CHANGED
|
@@ -5,53 +5,54 @@ namespace :kabosu do
|
|
|
5
5
|
|
|
6
6
|
desc "Install the core dictionary (default). VERSION=YYYYMMDD to pin a specific release."
|
|
7
7
|
task :install do
|
|
8
|
-
Kabosu::DictManager.new.install("core", version: ENV
|
|
8
|
+
Kabosu::DictManager.new.install("core", version: ENV.fetch("VERSION", nil))
|
|
9
9
|
end
|
|
10
10
|
|
|
11
11
|
namespace :install do
|
|
12
12
|
desc "Install the small dictionary. VERSION=YYYYMMDD to pin a specific release."
|
|
13
13
|
task :small do
|
|
14
|
-
Kabosu::DictManager.new.install("small", version: ENV
|
|
14
|
+
Kabosu::DictManager.new.install("small", version: ENV.fetch("VERSION", nil))
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
desc "Install the core dictionary. VERSION=YYYYMMDD to pin a specific release."
|
|
18
18
|
task :core do
|
|
19
|
-
Kabosu::DictManager.new.install("core", version: ENV
|
|
19
|
+
Kabosu::DictManager.new.install("core", version: ENV.fetch("VERSION", nil))
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
desc "Install the full dictionary. VERSION=YYYYMMDD to pin a specific release."
|
|
23
23
|
task :full do
|
|
24
|
-
Kabosu::DictManager.new.install("full", version: ENV
|
|
24
|
+
Kabosu::DictManager.new.install("full", version: ENV.fetch("VERSION", nil))
|
|
25
25
|
end
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
desc "Install a dictionary only if a matching one isn't already on disk.
|
|
28
|
+
desc "Install a dictionary only if a matching one isn't already on disk. " \
|
|
29
|
+
"EDITION=core|small|full (default core), VERSION=YYYYMMDD optional."
|
|
29
30
|
task :install_if_missing do
|
|
30
31
|
edition = ENV["EDITION"] || "core"
|
|
31
|
-
Kabosu::DictManager.new.install_if_missing(edition, version: ENV
|
|
32
|
+
Kabosu::DictManager.new.install_if_missing(edition, version: ENV.fetch("VERSION", nil))
|
|
32
33
|
end
|
|
33
34
|
|
|
34
35
|
# ── Remove ──
|
|
35
36
|
|
|
36
37
|
desc "Remove all installed dictionaries, or a specific one with EDITION=small|core|full and/or VERSION=YYYYMMDD"
|
|
37
38
|
task :remove do
|
|
38
|
-
Kabosu::DictManager.new.remove(edition: ENV
|
|
39
|
+
Kabosu::DictManager.new.remove(edition: ENV.fetch("EDITION", nil), version: ENV.fetch("VERSION", nil))
|
|
39
40
|
end
|
|
40
41
|
|
|
41
42
|
namespace :remove do
|
|
42
43
|
desc "Remove the small dictionary."
|
|
43
44
|
task :small do
|
|
44
|
-
Kabosu::DictManager.new.remove(edition: "small", version: ENV
|
|
45
|
+
Kabosu::DictManager.new.remove(edition: "small", version: ENV.fetch("VERSION", nil))
|
|
45
46
|
end
|
|
46
47
|
|
|
47
48
|
desc "Remove the core dictionary."
|
|
48
49
|
task :core do
|
|
49
|
-
Kabosu::DictManager.new.remove(edition: "core", version: ENV
|
|
50
|
+
Kabosu::DictManager.new.remove(edition: "core", version: ENV.fetch("VERSION", nil))
|
|
50
51
|
end
|
|
51
52
|
|
|
52
53
|
desc "Remove the full dictionary."
|
|
53
54
|
task :full do
|
|
54
|
-
Kabosu::DictManager.new.remove(edition: "full", version: ENV
|
|
55
|
+
Kabosu::DictManager.new.remove(edition: "full", version: ENV.fetch("VERSION", nil))
|
|
55
56
|
end
|
|
56
57
|
end
|
|
57
58
|
|
|
@@ -82,11 +83,9 @@ namespace :kabosu do
|
|
|
82
83
|
|
|
83
84
|
desc "Show the path to the best available dictionary. EDITION=small|core|full to be specific."
|
|
84
85
|
task :path do
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
exit 1
|
|
90
|
-
end
|
|
86
|
+
puts Kabosu::DictManager.new.find(edition: ENV.fetch("EDITION", nil))
|
|
87
|
+
rescue Kabosu::DictManager::DictNotFound => e
|
|
88
|
+
warn e.message
|
|
89
|
+
exit 1
|
|
91
90
|
end
|
|
92
91
|
end
|
data/lib/kabosu/version.rb
CHANGED
data/lib/kabosu.rb
CHANGED
|
@@ -66,25 +66,17 @@ module Kabosu
|
|
|
66
66
|
DEFAULT_CONFIG_PATH = File.expand_path("kabosu/resources/sudachi.json", __dir__).freeze
|
|
67
67
|
|
|
68
68
|
class << self
|
|
69
|
-
|
|
69
|
+
alias _new new
|
|
70
70
|
|
|
71
71
|
def new(config: nil, system_dict: nil, user_dicts: nil)
|
|
72
|
-
unless config.nil? || config.is_a?(String)
|
|
73
|
-
|
|
74
|
-
end
|
|
75
|
-
unless system_dict.nil? || system_dict.is_a?(String)
|
|
76
|
-
raise ArgumentError, "system_dict must be a String or nil"
|
|
77
|
-
end
|
|
72
|
+
raise ArgumentError, "config must be a String or nil" unless config.nil? || config.is_a?(String)
|
|
73
|
+
raise ArgumentError, "system_dict must be a String or nil" unless system_dict.nil? || system_dict.is_a?(String)
|
|
78
74
|
unless user_dicts.nil? || user_dicts.is_a?(Array)
|
|
79
75
|
raise ArgumentError, "user_dicts must be an Array<String> or nil"
|
|
80
76
|
end
|
|
81
|
-
if user_dicts&.any? { !_1.is_a?(String) }
|
|
82
|
-
raise ArgumentError, "user_dicts must contain only String values"
|
|
83
|
-
end
|
|
77
|
+
raise ArgumentError, "user_dicts must contain only String values" if user_dicts&.any? { !_1.is_a?(String) }
|
|
84
78
|
|
|
85
|
-
if config.nil? && system_dict.nil?
|
|
86
|
-
raise ArgumentError, "either config or system_dict is required"
|
|
87
|
-
end
|
|
79
|
+
raise ArgumentError, "either config or system_dict is required" if config.nil? && system_dict.nil?
|
|
88
80
|
|
|
89
81
|
# Default to the sudachi.json bundled with this gem when only
|
|
90
82
|
# system_dict is given. sudachi.rs's own default config path is
|
|
@@ -113,9 +105,7 @@ module Kabosu
|
|
|
113
105
|
|
|
114
106
|
def map_dictionary_init_error(error, config:, system_dict:)
|
|
115
107
|
message = error.message
|
|
116
|
-
if config && system_dict.nil?
|
|
117
|
-
ConfigError.new(message)
|
|
118
|
-
elsif message.match?(/config|setting\.json|json/i)
|
|
108
|
+
if (config && system_dict.nil?) || message.match?(/config|setting\.json|json/i)
|
|
119
109
|
ConfigError.new(message)
|
|
120
110
|
else
|
|
121
111
|
DictionaryError.new(message)
|
|
@@ -127,40 +117,33 @@ module Kabosu
|
|
|
127
117
|
end
|
|
128
118
|
end
|
|
129
119
|
|
|
130
|
-
|
|
131
|
-
|
|
120
|
+
alias _create create
|
|
121
|
+
alias _lookup lookup
|
|
132
122
|
|
|
133
123
|
def create(**options)
|
|
134
124
|
unknown = options.keys - %i[mode fields debug projection]
|
|
135
|
-
raise ArgumentError, "unknown keyword(s): #{unknown.join(
|
|
125
|
+
raise ArgumentError, "unknown keyword(s): #{unknown.join(", ")}" unless unknown.empty?
|
|
136
126
|
|
|
137
127
|
mode = options.fetch(:mode, MODE_C)
|
|
138
128
|
fields = options.fetch(:fields, nil)
|
|
139
129
|
debug = options.fetch(:debug, false)
|
|
140
130
|
projection = options.fetch(:projection, nil)
|
|
141
131
|
|
|
142
|
-
unless fields.nil? || fields.is_a?(Array)
|
|
143
|
-
raise ArgumentError, "fields must be an Array<String|Symbol> or nil"
|
|
144
|
-
end
|
|
132
|
+
raise ArgumentError, "fields must be an Array<String|Symbol> or nil" unless fields.nil? || fields.is_a?(Array)
|
|
145
133
|
if fields&.any? { !(_1.is_a?(String) || _1.is_a?(Symbol)) }
|
|
146
134
|
raise ArgumentError, "fields must contain only String or Symbol values"
|
|
147
135
|
end
|
|
148
|
-
|
|
149
|
-
raise ArgumentError, "debug must be true or false"
|
|
150
|
-
end
|
|
136
|
+
raise ArgumentError, "debug must be true or false" unless [true, false].include?(debug)
|
|
151
137
|
|
|
152
|
-
unless projection.nil?
|
|
153
|
-
raise NotImplementedError, "projection is not supported yet"
|
|
154
|
-
end
|
|
138
|
+
raise NotImplementedError, "projection is not supported yet" unless projection.nil?
|
|
155
139
|
|
|
156
140
|
mode_str = Kabosu.__send__(:normalize_mode, mode)
|
|
157
141
|
_create(mode_str, fields, debug)
|
|
158
142
|
end
|
|
159
143
|
|
|
160
144
|
def lookup(text)
|
|
161
|
-
unless text.is_a?(String)
|
|
162
|
-
|
|
163
|
-
end
|
|
145
|
+
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
146
|
+
|
|
164
147
|
MorphemeList.new(_lookup(text))
|
|
165
148
|
rescue RuntimeError => e
|
|
166
149
|
raise LookupError.new(e.message), cause: e
|
|
@@ -170,12 +153,10 @@ module Kabosu
|
|
|
170
153
|
# ── Tokenizer: wrap output in MorphemeList ──
|
|
171
154
|
|
|
172
155
|
class Tokenizer
|
|
173
|
-
|
|
156
|
+
alias _tokenize tokenize
|
|
174
157
|
|
|
175
158
|
def tokenize(text)
|
|
176
|
-
unless text.is_a?(String)
|
|
177
|
-
raise ArgumentError, "text must be a String"
|
|
178
|
-
end
|
|
159
|
+
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
179
160
|
|
|
180
161
|
batch = _tokenize(text)
|
|
181
162
|
cost = batch.respond_to?(:internal_cost) ? batch.internal_cost : nil
|
|
@@ -186,12 +167,11 @@ module Kabosu
|
|
|
186
167
|
end
|
|
187
168
|
|
|
188
169
|
class Morpheme
|
|
189
|
-
|
|
170
|
+
alias _split split
|
|
190
171
|
|
|
191
172
|
def split(mode: MODE_C, add_single: true)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
end
|
|
173
|
+
raise ArgumentError, "add_single must be true or false" unless [true, false].include?(add_single)
|
|
174
|
+
|
|
195
175
|
mode_str = Kabosu.__send__(:normalize_mode, mode)
|
|
196
176
|
MorphemeList.new(_split(mode_str, nil, add_single))
|
|
197
177
|
rescue RuntimeError => e
|
|
@@ -200,29 +180,15 @@ module Kabosu
|
|
|
200
180
|
end
|
|
201
181
|
|
|
202
182
|
def self.split_sentences(text, limit: nil, with_checker: false, ranges: false, dictionary: nil)
|
|
203
|
-
unless text.is_a?(String)
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
unless
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
if limit && limit < 1
|
|
210
|
-
raise ArgumentError, "limit must be greater than 0"
|
|
211
|
-
end
|
|
212
|
-
unless with_checker == true || with_checker == false
|
|
213
|
-
raise ArgumentError, "with_checker must be true or false"
|
|
214
|
-
end
|
|
215
|
-
unless ranges == true || ranges == false
|
|
216
|
-
raise ArgumentError, "ranges must be true or false"
|
|
217
|
-
end
|
|
218
|
-
unless dictionary.nil? || dictionary.is_a?(String)
|
|
219
|
-
raise ArgumentError, "dictionary must be a String path or nil"
|
|
220
|
-
end
|
|
183
|
+
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
184
|
+
raise ArgumentError, "limit must be an Integer or nil" unless limit.nil? || limit.is_a?(Integer)
|
|
185
|
+
raise ArgumentError, "limit must be greater than 0" if limit && limit < 1
|
|
186
|
+
raise ArgumentError, "with_checker must be true or false" unless [true, false].include?(with_checker)
|
|
187
|
+
raise ArgumentError, "ranges must be true or false" unless [true, false].include?(ranges)
|
|
188
|
+
raise ArgumentError, "dictionary must be a String path or nil" unless dictionary.nil? || dictionary.is_a?(String)
|
|
221
189
|
|
|
222
190
|
dict_path = nil
|
|
223
|
-
if with_checker
|
|
224
|
-
dict_path = dictionary || Dictionary.path
|
|
225
|
-
end
|
|
191
|
+
dict_path = dictionary || Dictionary.path if with_checker
|
|
226
192
|
|
|
227
193
|
if ranges
|
|
228
194
|
_split_sentences_with_ranges(text, limit, dict_path).map do |(start, finish, sentence)|
|
|
@@ -244,12 +210,8 @@ module Kabosu
|
|
|
244
210
|
# Kabosu.tokenize("東京都に住んでいる", tokenizer: tok)
|
|
245
211
|
#
|
|
246
212
|
def self.tokenize(text, tokenizer:)
|
|
247
|
-
unless text.is_a?(String)
|
|
248
|
-
|
|
249
|
-
end
|
|
250
|
-
unless tokenizer.is_a?(Tokenizer)
|
|
251
|
-
raise ArgumentError, "tokenizer must be a Kabosu::Tokenizer"
|
|
252
|
-
end
|
|
213
|
+
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
214
|
+
raise ArgumentError, "tokenizer must be a Kabosu::Tokenizer" unless tokenizer.is_a?(Tokenizer)
|
|
253
215
|
|
|
254
216
|
batch = tokenizer.__send__(:_tokenize, text)
|
|
255
217
|
cost = batch.respond_to?(:internal_cost) ? batch.internal_cost : nil
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kabosu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.
|
|
4
|
+
version: 0.6.11.0.dev.20260627.a5a69e7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- davafons
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-06-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -39,61 +39,75 @@ dependencies:
|
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '2.3'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name:
|
|
42
|
+
name: benchmark
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: minitest
|
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
|
44
58
|
requirements:
|
|
45
59
|
- - "~>"
|
|
46
60
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '
|
|
61
|
+
version: '5.0'
|
|
48
62
|
type: :development
|
|
49
63
|
prerelease: false
|
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
65
|
requirements:
|
|
52
66
|
- - "~>"
|
|
53
67
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '
|
|
68
|
+
version: '5.0'
|
|
55
69
|
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: rake
|
|
70
|
+
name: rake
|
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
|
58
72
|
requirements:
|
|
59
73
|
- - "~>"
|
|
60
74
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '
|
|
75
|
+
version: '13.0'
|
|
62
76
|
type: :development
|
|
63
77
|
prerelease: false
|
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
79
|
requirements:
|
|
66
80
|
- - "~>"
|
|
67
81
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '
|
|
82
|
+
version: '13.0'
|
|
69
83
|
- !ruby/object:Gem::Dependency
|
|
70
|
-
name:
|
|
84
|
+
name: rake-compiler
|
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
|
72
86
|
requirements:
|
|
73
|
-
- - "
|
|
87
|
+
- - "~>"
|
|
74
88
|
- !ruby/object:Gem::Version
|
|
75
|
-
version: '
|
|
89
|
+
version: '1.2'
|
|
76
90
|
type: :development
|
|
77
91
|
prerelease: false
|
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
93
|
requirements:
|
|
80
|
-
- - "
|
|
94
|
+
- - "~>"
|
|
81
95
|
- !ruby/object:Gem::Version
|
|
82
|
-
version: '
|
|
96
|
+
version: '1.2'
|
|
83
97
|
- !ruby/object:Gem::Dependency
|
|
84
|
-
name:
|
|
98
|
+
name: rubocop
|
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
|
86
100
|
requirements:
|
|
87
101
|
- - "~>"
|
|
88
102
|
- !ruby/object:Gem::Version
|
|
89
|
-
version: '
|
|
103
|
+
version: '1.0'
|
|
90
104
|
type: :development
|
|
91
105
|
prerelease: false
|
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
107
|
requirements:
|
|
94
108
|
- - "~>"
|
|
95
109
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '
|
|
110
|
+
version: '1.0'
|
|
97
111
|
description: Kabosu provides Ruby bindings for sudachi.rs, a Rust implementation of
|
|
98
112
|
the Sudachi Japanese morphological analyzer.
|
|
99
113
|
email:
|
|
@@ -133,7 +147,8 @@ files:
|
|
|
133
147
|
homepage: https://github.com/davafons/kabosu
|
|
134
148
|
licenses:
|
|
135
149
|
- Apache-2.0
|
|
136
|
-
metadata:
|
|
150
|
+
metadata:
|
|
151
|
+
rubygems_mfa_required: 'true'
|
|
137
152
|
post_install_message:
|
|
138
153
|
rdoc_options: []
|
|
139
154
|
require_paths:
|