kabosu 0.6.10.dev.20260225.c3c6711 → 0.6.10.dev.20260226.98055fb
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -7
- data/ext/kabosu/src/dictionary.rs +3 -0
- data/ext/kabosu/src/lib.rs +0 -10
- data/ext/kabosu/src/morpheme.rs +53 -26
- data/ext/kabosu/src/tokenizer.rs +0 -49
- data/lib/kabosu/version.rb +1 -1
- data/lib/kabosu.rb +0 -41
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 88012b4a49377c642ff1d694ae3d6da5e6400e2b4bf494c34ee99e1467b1be0c
|
|
4
|
+
data.tar.gz: 4c7324c24a1880ba62e006d8f5f3cea4610cc9e574122a4ece812b2acc649d68
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7bb1f40e57f79f5aab99bb3867db3ddaa0e6b0ef34f12b5d479dc961a7a6e5ea7dbbf79db1ac33e95b155843d722e45452716f1cca954f9a42346a9bfbc41b8c
|
|
7
|
+
data.tar.gz: 7b5de7868de3e7725a3c777a5d3374aa8b452c2611abcb0ffecf22cc74d5a1321055f85db690ca42a78fe0d114530fb354cdec2c1d60e47c00e042caa7cdef8b
|
data/README.md
CHANGED
|
@@ -54,6 +54,10 @@ morpheme.begin_c # => 0 - start character offset
|
|
|
54
54
|
morpheme.end_c # => 3 - end character offset
|
|
55
55
|
morpheme.system? # => true - from system dictionary?
|
|
56
56
|
morpheme.user? # => false - from user dictionary?
|
|
57
|
+
|
|
58
|
+
# Split text into natural Japanese sentence boundaries
|
|
59
|
+
Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。")
|
|
60
|
+
# => ["東京都に住んでいる。", "大阪も好きだ。"]
|
|
57
61
|
```
|
|
58
62
|
|
|
59
63
|
## Installation
|
|
@@ -109,7 +113,6 @@ tok_c.tokenize("東京都").surfaces # => ["東京都"]
|
|
|
109
113
|
```
|
|
110
114
|
|
|
111
115
|
Modes are symbols only (`:a`, `:b`, `:c` or `Kabosu::MODE_A/B/C`).
|
|
112
|
-
Invalid modes now raise `ArgumentError` (for example, `"A"`).
|
|
113
116
|
|
|
114
117
|
## Advanced Use Cases
|
|
115
118
|
|
|
@@ -135,12 +138,6 @@ dict.lookup("東京都").surfaces
|
|
|
135
138
|
m = tokenizer.tokenize("東京都").first
|
|
136
139
|
m.split(mode: :a).surfaces
|
|
137
140
|
|
|
138
|
-
# Bulk extractors
|
|
139
|
-
tokenizer.tokenize_surfaces("東京都に住んでいる")
|
|
140
|
-
tokenizer.tokenize_readings("東京都に住んでいる")
|
|
141
|
-
tokenizer.tokenize_dictionary_forms("東京都に住んでいる")
|
|
142
|
-
tokenizer.tokenize_normalized_forms("東京都に住んでいる")
|
|
143
|
-
|
|
144
141
|
# Sentence splitting
|
|
145
142
|
Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。", ranges: true)
|
|
146
143
|
Kabosu.split_sentences("長い文...", limit: 12, with_checker: true)
|
|
@@ -203,5 +200,6 @@ bundle exec rake kabosu:install # Install Sudachi dictionary
|
|
|
203
200
|
|
|
204
201
|
bundle exec rake compile # Build the native extension
|
|
205
202
|
bundle exec rake test # Run tests
|
|
203
|
+
|
|
206
204
|
bench/start # Run benchmarks
|
|
207
205
|
```
|
data/ext/kabosu/src/dictionary.rs
CHANGED
|
@@ -64,6 +64,9 @@ impl RbDictionary {
|
|
|
64
64
|
|
|
65
65
|
let data = MorphemeData {
|
|
66
66
|
surface: surface_slice.to_string(),
|
|
67
|
+
dictionary_form: info.dictionary_form().to_string(),
|
|
68
|
+
normalized_form: info.normalized_form().to_string(),
|
|
69
|
+
reading_form: info.reading_form().to_string(),
|
|
67
70
|
pos_id: info.pos_id(),
|
|
68
71
|
word_id_raw: entry.word_id.as_raw(),
|
|
69
72
|
is_oov: entry.word_id.is_oov(),
|
data/ext/kabosu/src/lib.rs
CHANGED
|
@@ -42,16 +42,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
42
42
|
// Kabosu::Tokenizer
|
|
43
43
|
let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
|
|
44
44
|
tok_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
|
|
45
|
-
tok_class.define_method("tokenize_surfaces", method!(RbTokenizer::tokenize_surfaces, 1))?;
|
|
46
|
-
tok_class.define_method("tokenize_readings", method!(RbTokenizer::tokenize_readings, 1))?;
|
|
47
|
-
tok_class.define_method(
|
|
48
|
-
"tokenize_dictionary_forms",
|
|
49
|
-
method!(RbTokenizer::tokenize_dictionary_forms, 1),
|
|
50
|
-
)?;
|
|
51
|
-
tok_class.define_method(
|
|
52
|
-
"tokenize_normalized_forms",
|
|
53
|
-
method!(RbTokenizer::tokenize_normalized_forms, 1),
|
|
54
|
-
)?;
|
|
55
45
|
tok_class.define_method("mode", method!(RbTokenizer::mode, 0))?;
|
|
56
46
|
tok_class.define_method("fields", method!(RbTokenizer::fields, 0))?;
|
|
57
47
|
tok_class.define_method("debug?", method!(RbTokenizer::is_debug, 0))?;
|
data/ext/kabosu/src/morpheme.rs
CHANGED
|
@@ -13,6 +13,9 @@ use crate::parsing::parse_mode;
|
|
|
13
13
|
#[derive(Clone)]
|
|
14
14
|
pub(crate) struct MorphemeData {
|
|
15
15
|
pub(crate) surface: String,
|
|
16
|
+
pub(crate) dictionary_form: String,
|
|
17
|
+
pub(crate) normalized_form: String,
|
|
18
|
+
pub(crate) reading_form: String,
|
|
16
19
|
pub(crate) pos_id: u16,
|
|
17
20
|
pub(crate) word_id_raw: u32,
|
|
18
21
|
pub(crate) is_oov: bool,
|
|
@@ -37,6 +40,9 @@ where
|
|
|
37
40
|
|
|
38
41
|
MorphemeData {
|
|
39
42
|
surface,
|
|
43
|
+
dictionary_form: m.dictionary_form().to_string(),
|
|
44
|
+
normalized_form: m.normalized_form().to_string(),
|
|
45
|
+
reading_form: m.reading_form().to_string(),
|
|
40
46
|
pos_id: m.part_of_speech_id(),
|
|
41
47
|
word_id_raw: m.word_id().as_raw(),
|
|
42
48
|
is_oov: m.is_oov(),
|
|
@@ -84,9 +90,6 @@ pub(crate) struct RbMorpheme {
|
|
|
84
90
|
}
|
|
85
91
|
|
|
86
92
|
struct LazyWordFields {
|
|
87
|
-
dictionary_form: String,
|
|
88
|
-
normalized_form: String,
|
|
89
|
-
reading_form: String,
|
|
90
93
|
synonym_group_ids: Vec<u32>,
|
|
91
94
|
dictionary_form_word_id: i32,
|
|
92
95
|
head_word_length: usize,
|
|
@@ -96,14 +99,31 @@ struct LazyWordFields {
|
|
|
96
99
|
}
|
|
97
100
|
|
|
98
101
|
impl RbMorpheme {
|
|
102
|
+
fn fallback_word_fields(&self) -> LazyWordFields {
|
|
103
|
+
LazyWordFields {
|
|
104
|
+
synonym_group_ids: Vec::new(),
|
|
105
|
+
dictionary_form_word_id: -1,
|
|
106
|
+
head_word_length: self.data.surface.chars().count(),
|
|
107
|
+
a_unit_split: Vec::new(),
|
|
108
|
+
b_unit_split: Vec::new(),
|
|
109
|
+
word_structure: Vec::new(),
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
|
|
99
113
|
fn resolve_word_fields(&self) -> &LazyWordFields {
|
|
100
114
|
self.word_fields.get_or_init(|| {
|
|
115
|
+
if self.data.is_oov || self.data.dictionary_id < 0 {
|
|
116
|
+
return self.fallback_word_fields();
|
|
117
|
+
}
|
|
118
|
+
|
|
101
119
|
let wid = WordId::from_raw(self.data.word_id_raw);
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
120
|
+
|
|
121
|
+
let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
122
|
+
self.dict.lexicon().get_word_info(wid)
|
|
123
|
+
}));
|
|
124
|
+
|
|
125
|
+
match info_result {
|
|
126
|
+
Ok(Ok(info)) => LazyWordFields {
|
|
107
127
|
synonym_group_ids: info.synonym_group_ids().to_vec(),
|
|
108
128
|
dictionary_form_word_id: info.dictionary_form_word_id(),
|
|
109
129
|
head_word_length: info.head_word_length(),
|
|
@@ -111,28 +131,25 @@ impl RbMorpheme {
|
|
|
111
131
|
b_unit_split: info.b_unit_split().iter().map(WordId::as_raw).collect(),
|
|
112
132
|
word_structure: info.word_structure().iter().map(WordId::as_raw).collect(),
|
|
113
133
|
},
|
|
114
|
-
|
|
115
|
-
dictionary_form: self.data.surface.clone(),
|
|
116
|
-
normalized_form: self.data.surface.clone(),
|
|
117
|
-
reading_form: self.data.surface.clone(),
|
|
118
|
-
synonym_group_ids: Vec::new(),
|
|
119
|
-
dictionary_form_word_id: -1,
|
|
120
|
-
head_word_length: self.data.surface.chars().count(),
|
|
121
|
-
a_unit_split: Vec::new(),
|
|
122
|
-
b_unit_split: Vec::new(),
|
|
123
|
-
word_structure: Vec::new(),
|
|
124
|
-
},
|
|
134
|
+
_ => self.fallback_word_fields(),
|
|
125
135
|
}
|
|
126
136
|
})
|
|
127
137
|
}
|
|
128
138
|
|
|
129
139
|
fn split_ids_from_lexicon(&self, mode: Mode) -> Result<Vec<u32>, Error> {
|
|
130
|
-
|
|
131
|
-
if wid.is_oov() {
|
|
140
|
+
if self.data.is_oov || self.data.dictionary_id < 0 {
|
|
132
141
|
return Ok(Vec::new());
|
|
133
142
|
}
|
|
143
|
+
let wid = WordId::from_raw(self.data.word_id_raw);
|
|
134
144
|
|
|
135
|
-
let
|
|
145
|
+
let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
146
|
+
self.dict.lexicon().get_word_info(wid)
|
|
147
|
+
}));
|
|
148
|
+
let info = match info_result {
|
|
149
|
+
Ok(Ok(info)) => info,
|
|
150
|
+
Ok(Err(e)) => return Err(sudachi_error(e)),
|
|
151
|
+
Err(_) => return Err(sudachi_error("panic while reading word info for split")),
|
|
152
|
+
};
|
|
136
153
|
let ids = match mode {
|
|
137
154
|
Mode::A => info.a_unit_split(),
|
|
138
155
|
Mode::B => info.b_unit_split(),
|
|
@@ -174,7 +191,14 @@ impl RbMorpheme {
|
|
|
174
191
|
|
|
175
192
|
for (i, &raw_wid) in split_ids.iter().enumerate() {
|
|
176
193
|
let wid = WordId::from_raw(raw_wid);
|
|
177
|
-
let
|
|
194
|
+
let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
|
195
|
+
self.dict.lexicon().get_word_info(wid)
|
|
196
|
+
}));
|
|
197
|
+
let info = match info_result {
|
|
198
|
+
Ok(Ok(info)) => info,
|
|
199
|
+
Ok(Err(e)) => return Err(sudachi_error(e)),
|
|
200
|
+
Err(_) => return Err(sudachi_error("panic while reading child word info for split")),
|
|
201
|
+
};
|
|
178
202
|
|
|
179
203
|
// head_word_length is in codepoints; clamp to remaining characters.
|
|
180
204
|
let mut span_chars = info.head_word_length();
|
|
@@ -190,6 +214,9 @@ impl RbMorpheme {
|
|
|
190
214
|
|
|
191
215
|
let child = MorphemeData {
|
|
192
216
|
surface: surface[start_byte..end_byte].to_string(),
|
|
217
|
+
dictionary_form: info.dictionary_form().to_string(),
|
|
218
|
+
normalized_form: info.normalized_form().to_string(),
|
|
219
|
+
reading_form: info.reading_form().to_string(),
|
|
193
220
|
pos_id: info.pos_id(),
|
|
194
221
|
word_id_raw: raw_wid,
|
|
195
222
|
is_oov: wid.is_oov(),
|
|
@@ -230,15 +257,15 @@ impl RbMorpheme {
|
|
|
230
257
|
}
|
|
231
258
|
|
|
232
259
|
pub(crate) fn dictionary_form(&self) -> &str {
|
|
233
|
-
&self.
|
|
260
|
+
&self.data.dictionary_form
|
|
234
261
|
}
|
|
235
262
|
|
|
236
263
|
pub(crate) fn normalized_form(&self) -> &str {
|
|
237
|
-
&self.
|
|
264
|
+
&self.data.normalized_form
|
|
238
265
|
}
|
|
239
266
|
|
|
240
267
|
pub(crate) fn reading_form(&self) -> &str {
|
|
241
|
-
&self.
|
|
268
|
+
&self.data.reading_form
|
|
242
269
|
}
|
|
243
270
|
|
|
244
271
|
pub(crate) fn is_oov(&self) -> bool {
|
data/ext/kabosu/src/tokenizer.rs
CHANGED
|
@@ -8,7 +8,6 @@ use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
|
|
|
8
8
|
use sudachi::analysis::Mode;
|
|
9
9
|
use sudachi::dic::dictionary::JapaneseDictionary;
|
|
10
10
|
use sudachi::dic::subset::InfoSubset;
|
|
11
|
-
use sudachi::dic::word_id::WordId;
|
|
12
11
|
use sudachi::prelude::MorphemeList;
|
|
13
12
|
|
|
14
13
|
use crate::errors::sudachi_error;
|
|
@@ -168,20 +167,6 @@ impl RbTokenizer {
|
|
|
168
167
|
}
|
|
169
168
|
}
|
|
170
169
|
|
|
171
|
-
fn collect_strings<F>(&self, text: String, mut project: F) -> Result<RArray, Error>
|
|
172
|
-
where
|
|
173
|
-
F: FnMut(&MorphemeData) -> String,
|
|
174
|
-
{
|
|
175
|
-
let ruby = Ruby::get().unwrap();
|
|
176
|
-
let analyzed = self.analyze(text)?;
|
|
177
|
-
let ary = ruby.ary_new_capa(analyzed.morphemes.len());
|
|
178
|
-
for data in analyzed.morphemes {
|
|
179
|
-
let projected = project(&data);
|
|
180
|
-
ary.push(ruby.str_new(&projected))?;
|
|
181
|
-
}
|
|
182
|
-
Ok(ary)
|
|
183
|
-
}
|
|
184
|
-
|
|
185
170
|
pub(crate) fn tokenize(&self, text: String) -> Result<RbTokenBatch, Error> {
|
|
186
171
|
let analyzed = self.analyze(text)?;
|
|
187
172
|
Ok(RbTokenBatch::new(
|
|
@@ -192,40 +177,6 @@ impl RbTokenizer {
|
|
|
192
177
|
))
|
|
193
178
|
}
|
|
194
179
|
|
|
195
|
-
pub(crate) fn tokenize_surfaces(&self, text: String) -> Result<RArray, Error> {
|
|
196
|
-
self.collect_strings(text, |data| data.surface.clone())
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
pub(crate) fn tokenize_readings(&self, text: String) -> Result<RArray, Error> {
|
|
200
|
-
let lexicon = self.pool.dict.lexicon();
|
|
201
|
-
self.collect_strings(text, |data| {
|
|
202
|
-
lexicon
|
|
203
|
-
.get_word_info(WordId::from_raw(data.word_id_raw))
|
|
204
|
-
.map(|info| info.reading_form().to_string())
|
|
205
|
-
.unwrap_or_else(|_| data.surface.clone())
|
|
206
|
-
})
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
pub(crate) fn tokenize_dictionary_forms(&self, text: String) -> Result<RArray, Error> {
|
|
210
|
-
let lexicon = self.pool.dict.lexicon();
|
|
211
|
-
self.collect_strings(text, |data| {
|
|
212
|
-
lexicon
|
|
213
|
-
.get_word_info(WordId::from_raw(data.word_id_raw))
|
|
214
|
-
.map(|info| info.dictionary_form().to_string())
|
|
215
|
-
.unwrap_or_else(|_| data.surface.clone())
|
|
216
|
-
})
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
pub(crate) fn tokenize_normalized_forms(&self, text: String) -> Result<RArray, Error> {
|
|
220
|
-
let lexicon = self.pool.dict.lexicon();
|
|
221
|
-
self.collect_strings(text, |data| {
|
|
222
|
-
lexicon
|
|
223
|
-
.get_word_info(WordId::from_raw(data.word_id_raw))
|
|
224
|
-
.map(|info| info.normalized_form().to_string())
|
|
225
|
-
.unwrap_or_else(|_| data.surface.clone())
|
|
226
|
-
})
|
|
227
|
-
}
|
|
228
|
-
|
|
229
180
|
pub(crate) fn mode(&self) -> String {
|
|
230
181
|
self.mode.to_string()
|
|
231
182
|
}
|
data/lib/kabosu/version.rb
CHANGED
data/lib/kabosu.rb
CHANGED
|
@@ -162,47 +162,6 @@ module Kabosu
|
|
|
162
162
|
rescue RuntimeError => e
|
|
163
163
|
raise TokenizationError.new(e.message), cause: e
|
|
164
164
|
end
|
|
165
|
-
|
|
166
|
-
alias_method :_tokenize_surfaces, :tokenize_surfaces
|
|
167
|
-
alias_method :_tokenize_readings, :tokenize_readings
|
|
168
|
-
alias_method :_tokenize_dictionary_forms, :tokenize_dictionary_forms
|
|
169
|
-
alias_method :_tokenize_normalized_forms, :tokenize_normalized_forms
|
|
170
|
-
|
|
171
|
-
def tokenize_surfaces(text)
|
|
172
|
-
unless text.is_a?(String)
|
|
173
|
-
raise ArgumentError, "text must be a String"
|
|
174
|
-
end
|
|
175
|
-
_tokenize_surfaces(text)
|
|
176
|
-
rescue RuntimeError => e
|
|
177
|
-
raise TokenizationError.new(e.message), cause: e
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
def tokenize_readings(text)
|
|
181
|
-
unless text.is_a?(String)
|
|
182
|
-
raise ArgumentError, "text must be a String"
|
|
183
|
-
end
|
|
184
|
-
_tokenize_readings(text)
|
|
185
|
-
rescue RuntimeError => e
|
|
186
|
-
raise TokenizationError.new(e.message), cause: e
|
|
187
|
-
end
|
|
188
|
-
|
|
189
|
-
def tokenize_dictionary_forms(text)
|
|
190
|
-
unless text.is_a?(String)
|
|
191
|
-
raise ArgumentError, "text must be a String"
|
|
192
|
-
end
|
|
193
|
-
_tokenize_dictionary_forms(text)
|
|
194
|
-
rescue RuntimeError => e
|
|
195
|
-
raise TokenizationError.new(e.message), cause: e
|
|
196
|
-
end
|
|
197
|
-
|
|
198
|
-
def tokenize_normalized_forms(text)
|
|
199
|
-
unless text.is_a?(String)
|
|
200
|
-
raise ArgumentError, "text must be a String"
|
|
201
|
-
end
|
|
202
|
-
_tokenize_normalized_forms(text)
|
|
203
|
-
rescue RuntimeError => e
|
|
204
|
-
raise TokenizationError.new(e.message), cause: e
|
|
205
|
-
end
|
|
206
165
|
end
|
|
207
166
|
|
|
208
167
|
class Morpheme
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kabosu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.10.dev.20260225.c3c6711
|
|
4
|
+
version: 0.6.10.dev.20260226.98055fb
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- davafons
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|