kabosu 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +94 -0
- data/ext/kabosu/Cargo.toml +12 -0
- data/ext/kabosu/extconf.rb +4 -0
- data/ext/kabosu/src/lib.rs +490 -0
- data/lib/kabosu/dict_manager.rb +239 -0
- data/lib/kabosu/morpheme_list.rb +97 -0
- data/lib/kabosu/pos_matcher.rb +119 -0
- data/lib/kabosu/tasks.rake +86 -0
- data/lib/kabosu/version.rb +3 -0
- data/lib/kabosu.rb +143 -0
- metadata +106 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: abf1417bc26350c38353f26adabd6742194a545d0f7a781138452c0077730a1b
|
|
4
|
+
data.tar.gz: 23eeec85cccacc705fae1c74fa820e8df2e95fafb0e31f8b04d0676b5776bed6
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 0efce4afc57189336cecf4cbb6962f1fd1db9781baf3f99709c786b3c9fff384f38149fb1bf2b33ca1d0844a0755cd067f1dc4886ca76477c6e044cbfd1b4c0f
|
|
7
|
+
data.tar.gz: a9d0043d1a4807d1964365cdb5b8b65425384845aed2856b829516486426901f6e80d6b16cc7bba2b5ddff9d7c0f3c28279a071c4183f61aad3c24d9ef05f798
|
data/README.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Kabosu
|
|
2
|
+
|
|
3
|
+
Ruby bindings for [sudachi.rs](https://github.com/WorksApplications/sudachi.rs), a Rust implementation of the [Sudachi](https://github.com/WorksApplications/Sudachi) Japanese morphological analyzer.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Ruby >= 3.1
|
|
8
|
+
- Rust toolchain (for compiling the native extension)
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Add to your Gemfile:
|
|
13
|
+
|
|
14
|
+
```ruby
|
|
15
|
+
gem "kabosu"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Then install and download a Sudachi dictionary:
|
|
19
|
+
|
|
20
|
+
```sh
|
|
21
|
+
bundle install
|
|
22
|
+
bundle exec rake kabosu:install[small] # or core, full
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Dictionary editions (from smallest to largest): `small`, `core`, `full`.
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
require "kabosu"
|
|
31
|
+
|
|
32
|
+
# Tokenize Japanese text (auto-discovers installed dictionary)
|
|
33
|
+
morphemes = Kabosu.tokenize("東京都に住んでいる")
|
|
34
|
+
|
|
35
|
+
morphemes.surfaces # => ["東京都", "に", "住ん", "で", "いる"]
|
|
36
|
+
morphemes.readings # => ["トウキョウト", "ニ", "スン", "デ", "イル"]
|
|
37
|
+
morphemes.dictionary_forms # => ["東京都", "に", "住む", "で", "居る"]
|
|
38
|
+
|
|
39
|
+
morphemes.each do |m|
|
|
40
|
+
puts "#{m.surface}\t#{m.part_of_speech.join(',')}\t#{m.reading_form}"
|
|
41
|
+
end
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Tokenization modes
|
|
45
|
+
|
|
46
|
+
Sudachi provides three split modes:
|
|
47
|
+
|
|
48
|
+
| Mode | Description |
|
|
49
|
+
|------|-------------|
|
|
50
|
+
| `A` | Short units (most granular) |
|
|
51
|
+
| `B` | Middle units |
|
|
52
|
+
| `C` | Named entity units (default) |
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
Kabosu.tokenize("東京都", mode: "A").surfaces # => ["東京", "都"]
|
|
56
|
+
Kabosu.tokenize("東京都", mode: "C").surfaces # => ["東京都"]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Direct API
|
|
60
|
+
|
|
61
|
+
For more control, create a dictionary and tokenizer directly:
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
dict = Kabosu::Dictionary.new
|
|
65
|
+
tokenizer = dict.create("C")
|
|
66
|
+
morphemes = tokenizer.tokenize("国会議事堂前駅")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Dictionary management
|
|
70
|
+
|
|
71
|
+
Rake tasks for managing Sudachi dictionaries:
|
|
72
|
+
|
|
73
|
+
```sh
|
|
74
|
+
rake kabosu:install[small] # Install a dictionary (VERSION=YYYYMMDD for a specific version)
|
|
75
|
+
rake kabosu:list # List installed dictionaries
|
|
76
|
+
rake kabosu:versions # Show available versions from GitHub
|
|
77
|
+
rake kabosu:path # Show path to best available dictionary
|
|
78
|
+
rake kabosu:remove[small] # Remove a dictionary (VERSION=YYYYMMDD for a specific version)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Dictionaries are stored in `~/.kabosu/dict/` by default. Set `KABOSU_DICT_DIR` to customize.
|
|
82
|
+
|
|
83
|
+
## Development
|
|
84
|
+
|
|
85
|
+
```sh
|
|
86
|
+
bundle install
|
|
87
|
+
bundle exec rake compile # Build the native extension
|
|
88
|
+
bundle exec rake test # Run tests
|
|
89
|
+
bundle exec rake # Compile + test
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "kabosu"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
publish = false
|
|
6
|
+
|
|
7
|
+
[lib]
|
|
8
|
+
crate-type = ["cdylib"]
|
|
9
|
+
|
|
10
|
+
[dependencies]
|
|
11
|
+
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
12
|
+
sudachi = { git = "https://github.com/WorksApplications/sudachi.rs", branch = "develop" }
|
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
use magnus::{function, method, prelude::*, Error, RArray, Ruby, Value};
|
|
2
|
+
use std::cell::{Cell, RefCell};
|
|
3
|
+
use std::sync::Arc;
|
|
4
|
+
use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
|
|
5
|
+
use sudachi::analysis::stateless_tokenizer::StatelessTokenizer;
|
|
6
|
+
use sudachi::analysis::{Mode, Tokenize};
|
|
7
|
+
use sudachi::config::Config;
|
|
8
|
+
use sudachi::dic::dictionary::JapaneseDictionary;
|
|
9
|
+
use sudachi::prelude::MorphemeList;
|
|
10
|
+
use sudachi::sentence_splitter::{SentenceSplitter, SplitSentences};
|
|
11
|
+
|
|
12
|
+
fn sudachi_error(e: impl std::fmt::Display) -> Error {
|
|
13
|
+
Error::new(
|
|
14
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
|
15
|
+
e.to_string(),
|
|
16
|
+
)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// ---------- Dictionary ----------
|
|
20
|
+
|
|
21
|
+
#[magnus::wrap(class = "Kabosu::Dictionary")]
|
|
22
|
+
struct RbDictionary {
|
|
23
|
+
inner: Arc<JapaneseDictionary>,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
impl RbDictionary {
|
|
27
|
+
fn new(ruby: &Ruby, args: &[Value]) -> Result<Self, Error> {
|
|
28
|
+
let (config_path, dict_path): (Option<String>, Option<String>) = match args.len() {
|
|
29
|
+
0 => (None, None),
|
|
30
|
+
1 => (<Option<String>>::try_convert(args[0])?, None),
|
|
31
|
+
2 => (
|
|
32
|
+
<Option<String>>::try_convert(args[0])?,
|
|
33
|
+
<Option<String>>::try_convert(args[1])?,
|
|
34
|
+
),
|
|
35
|
+
_ => {
|
|
36
|
+
return Err(Error::new(
|
|
37
|
+
ruby.exception_arg_error(),
|
|
38
|
+
format!(
|
|
39
|
+
"wrong number of arguments (given {}, expected 0..2)",
|
|
40
|
+
args.len()
|
|
41
|
+
),
|
|
42
|
+
))
|
|
43
|
+
}
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
let cfg = match (&config_path, &dict_path) {
|
|
47
|
+
(None, None) => Config::new(None, None, None).map_err(sudachi_error)?,
|
|
48
|
+
(None, Some(dict)) => {
|
|
49
|
+
Config::new(None, None, Some(dict.into())).map_err(sudachi_error)?
|
|
50
|
+
}
|
|
51
|
+
(Some(cfg_path), None) => {
|
|
52
|
+
Config::new(Some(cfg_path.into()), None, None).map_err(sudachi_error)?
|
|
53
|
+
}
|
|
54
|
+
(Some(cfg_path), Some(dict)) => {
|
|
55
|
+
Config::new(Some(cfg_path.into()), None, Some(dict.into()))
|
|
56
|
+
.map_err(sudachi_error)?
|
|
57
|
+
}
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
let dict = JapaneseDictionary::from_cfg(&cfg).map_err(sudachi_error)?;
|
|
61
|
+
|
|
62
|
+
Ok(Self {
|
|
63
|
+
inner: Arc::new(dict),
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
fn create(&self, mode: Option<String>) -> RbTokenizer {
|
|
68
|
+
let mode = parse_mode(mode.as_deref());
|
|
69
|
+
RbTokenizer {
|
|
70
|
+
dict: self.inner.clone(),
|
|
71
|
+
mode,
|
|
72
|
+
debug: Cell::new(false),
|
|
73
|
+
last_internal_cost: Cell::new(0),
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
fn create_stateful(&self, mode: Option<String>) -> RbStatefulTokenizer {
|
|
78
|
+
let mode = parse_mode(mode.as_deref());
|
|
79
|
+
let tokenizer = StatefulTokenizer::new(self.inner.clone(), mode);
|
|
80
|
+
RbStatefulTokenizer {
|
|
81
|
+
dict: self.inner.clone(),
|
|
82
|
+
inner: RefCell::new(tokenizer),
|
|
83
|
+
mode,
|
|
84
|
+
debug: Cell::new(false),
|
|
85
|
+
last_internal_cost: Cell::new(0),
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// ---------- Tokenizer ----------
|
|
91
|
+
|
|
92
|
+
#[magnus::wrap(class = "Kabosu::Tokenizer")]
|
|
93
|
+
struct RbTokenizer {
|
|
94
|
+
dict: Arc<JapaneseDictionary>,
|
|
95
|
+
mode: Mode,
|
|
96
|
+
debug: Cell<bool>,
|
|
97
|
+
last_internal_cost: Cell<i32>,
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
impl RbTokenizer {
|
|
101
|
+
fn tokenize(&self, text: String) -> Result<RArray, Error> {
|
|
102
|
+
let ruby = Ruby::get().unwrap();
|
|
103
|
+
let tokenizer = StatelessTokenizer::new(&*self.dict);
|
|
104
|
+
let morphemes = tokenizer
|
|
105
|
+
.tokenize(&text, self.mode, self.debug.get())
|
|
106
|
+
.map_err(sudachi_error)?;
|
|
107
|
+
|
|
108
|
+
self.last_internal_cost.set(morphemes.get_internal_cost());
|
|
109
|
+
|
|
110
|
+
let ary = ruby.ary_new_capa(morphemes.len());
|
|
111
|
+
for i in 0..morphemes.len() {
|
|
112
|
+
let m = morphemes.get(i);
|
|
113
|
+
let wid = m.word_id();
|
|
114
|
+
let rb_m = RbMorpheme {
|
|
115
|
+
surface: m.surface().to_string(),
|
|
116
|
+
pos: m.part_of_speech().iter().map(|s| s.to_string()).collect(),
|
|
117
|
+
pos_id: m.part_of_speech_id(),
|
|
118
|
+
dictionary_form: m.dictionary_form().to_string(),
|
|
119
|
+
normalized_form: m.normalized_form().to_string(),
|
|
120
|
+
reading_form: m.reading_form().to_string(),
|
|
121
|
+
is_oov: m.is_oov(),
|
|
122
|
+
dictionary_id: m.dictionary_id(),
|
|
123
|
+
word_id_raw: wid.as_raw(),
|
|
124
|
+
is_system: wid.is_system(),
|
|
125
|
+
is_user: wid.is_user(),
|
|
126
|
+
begin: m.begin(),
|
|
127
|
+
end: m.end(),
|
|
128
|
+
begin_c: m.begin_c(),
|
|
129
|
+
end_c: m.end_c(),
|
|
130
|
+
synonym_group_ids: m.synonym_group_ids().to_vec(),
|
|
131
|
+
total_cost: m.total_cost(),
|
|
132
|
+
dict: self.dict.clone(),
|
|
133
|
+
debug: self.debug.get(),
|
|
134
|
+
};
|
|
135
|
+
ary.push(rb_m)?;
|
|
136
|
+
}
|
|
137
|
+
Ok(ary)
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
fn mode(&self) -> String {
|
|
141
|
+
self.mode.to_string()
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
fn set_debug(&self, value: bool) {
|
|
145
|
+
self.debug.set(value);
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
fn is_debug(&self) -> bool {
|
|
149
|
+
self.debug.get()
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
fn internal_cost(&self) -> i32 {
|
|
153
|
+
self.last_internal_cost.get()
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
fn tokenize_sentences(&self, text: String) -> Result<RArray, Error> {
|
|
157
|
+
let ruby = Ruby::get().unwrap();
|
|
158
|
+
let splitter = SentenceSplitter::new();
|
|
159
|
+
let result = ruby.ary_new();
|
|
160
|
+
|
|
161
|
+
for (_range, sentence) in splitter.split(&text) {
|
|
162
|
+
let morphemes = self.tokenize(sentence.to_string())?;
|
|
163
|
+
result.push(morphemes)?;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
Ok(result)
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// ---------- StatefulTokenizer ----------
|
|
171
|
+
|
|
172
|
+
#[magnus::wrap(class = "Kabosu::StatefulTokenizer")]
|
|
173
|
+
struct RbStatefulTokenizer {
|
|
174
|
+
dict: Arc<JapaneseDictionary>,
|
|
175
|
+
inner: RefCell<StatefulTokenizer<Arc<JapaneseDictionary>>>,
|
|
176
|
+
mode: Mode,
|
|
177
|
+
debug: Cell<bool>,
|
|
178
|
+
last_internal_cost: Cell<i32>,
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
impl RbStatefulTokenizer {
|
|
182
|
+
fn tokenize(&self, text: String) -> Result<RArray, Error> {
|
|
183
|
+
let ruby = Ruby::get().unwrap();
|
|
184
|
+
|
|
185
|
+
let mut tokenizer = self.inner.borrow_mut();
|
|
186
|
+
tokenizer.set_debug(self.debug.get());
|
|
187
|
+
|
|
188
|
+
// Reset and write input text
|
|
189
|
+
tokenizer.reset().push_str(&text);
|
|
190
|
+
|
|
191
|
+
// Perform tokenization
|
|
192
|
+
tokenizer.do_tokenize().map_err(sudachi_error)?;
|
|
193
|
+
|
|
194
|
+
// Collect results into a MorphemeList
|
|
195
|
+
let mut mlist = MorphemeList::empty(self.dict.clone());
|
|
196
|
+
mlist.collect_results(&mut *tokenizer).map_err(sudachi_error)?;
|
|
197
|
+
|
|
198
|
+
self.last_internal_cost.set(mlist.get_internal_cost());
|
|
199
|
+
|
|
200
|
+
let ary = ruby.ary_new_capa(mlist.len());
|
|
201
|
+
for i in 0..mlist.len() {
|
|
202
|
+
let m = mlist.get(i);
|
|
203
|
+
let wid = m.word_id();
|
|
204
|
+
let rb_m = RbMorpheme {
|
|
205
|
+
surface: m.surface().to_string(),
|
|
206
|
+
pos: m.part_of_speech().iter().map(|s| s.to_string()).collect(),
|
|
207
|
+
pos_id: m.part_of_speech_id(),
|
|
208
|
+
dictionary_form: m.dictionary_form().to_string(),
|
|
209
|
+
normalized_form: m.normalized_form().to_string(),
|
|
210
|
+
reading_form: m.reading_form().to_string(),
|
|
211
|
+
is_oov: m.is_oov(),
|
|
212
|
+
dictionary_id: m.dictionary_id(),
|
|
213
|
+
word_id_raw: wid.as_raw(),
|
|
214
|
+
is_system: wid.is_system(),
|
|
215
|
+
is_user: wid.is_user(),
|
|
216
|
+
begin: m.begin(),
|
|
217
|
+
end: m.end(),
|
|
218
|
+
begin_c: m.begin_c(),
|
|
219
|
+
end_c: m.end_c(),
|
|
220
|
+
synonym_group_ids: m.synonym_group_ids().to_vec(),
|
|
221
|
+
total_cost: m.total_cost(),
|
|
222
|
+
dict: self.dict.clone(),
|
|
223
|
+
debug: self.debug.get(),
|
|
224
|
+
};
|
|
225
|
+
ary.push(rb_m)?;
|
|
226
|
+
}
|
|
227
|
+
Ok(ary)
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
fn mode(&self) -> String {
|
|
231
|
+
self.mode.to_string()
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
fn set_debug(&self, value: bool) {
|
|
235
|
+
self.debug.set(value);
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
fn is_debug(&self) -> bool {
|
|
239
|
+
self.debug.get()
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
fn internal_cost(&self) -> i32 {
|
|
243
|
+
self.last_internal_cost.get()
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
fn tokenize_sentences(&self, text: String) -> Result<RArray, Error> {
|
|
247
|
+
let ruby = Ruby::get().unwrap();
|
|
248
|
+
let splitter = SentenceSplitter::new();
|
|
249
|
+
let result = ruby.ary_new();
|
|
250
|
+
|
|
251
|
+
for (_range, sentence) in splitter.split(&text) {
|
|
252
|
+
let morphemes = self.tokenize(sentence.to_string())?;
|
|
253
|
+
result.push(morphemes)?;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
Ok(result)
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// ---------- Morpheme ----------
|
|
261
|
+
|
|
262
|
+
#[magnus::wrap(class = "Kabosu::Morpheme")]
|
|
263
|
+
struct RbMorpheme {
|
|
264
|
+
surface: String,
|
|
265
|
+
pos: Vec<String>,
|
|
266
|
+
pos_id: u16,
|
|
267
|
+
dictionary_form: String,
|
|
268
|
+
normalized_form: String,
|
|
269
|
+
reading_form: String,
|
|
270
|
+
is_oov: bool,
|
|
271
|
+
dictionary_id: i32,
|
|
272
|
+
word_id_raw: u32,
|
|
273
|
+
is_system: bool,
|
|
274
|
+
is_user: bool,
|
|
275
|
+
begin: usize,
|
|
276
|
+
end: usize,
|
|
277
|
+
begin_c: usize,
|
|
278
|
+
end_c: usize,
|
|
279
|
+
synonym_group_ids: Vec<u32>,
|
|
280
|
+
total_cost: i32,
|
|
281
|
+
dict: Arc<JapaneseDictionary>,
|
|
282
|
+
debug: bool,
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
impl RbMorpheme {
|
|
286
|
+
fn surface(&self) -> &str {
|
|
287
|
+
&self.surface
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
fn part_of_speech(&self) -> Vec<String> {
|
|
291
|
+
self.pos.clone()
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
fn part_of_speech_id(&self) -> u16 {
|
|
295
|
+
self.pos_id
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
fn dictionary_form(&self) -> &str {
|
|
299
|
+
&self.dictionary_form
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
fn normalized_form(&self) -> &str {
|
|
303
|
+
&self.normalized_form
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
fn reading_form(&self) -> &str {
|
|
307
|
+
&self.reading_form
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
fn is_oov(&self) -> bool {
|
|
311
|
+
self.is_oov
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
fn dictionary_id(&self) -> i32 {
|
|
315
|
+
self.dictionary_id
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
fn word_id(&self) -> u32 {
|
|
319
|
+
self.word_id_raw
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
fn begin(&self) -> usize {
|
|
323
|
+
self.begin
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
fn end(&self) -> usize {
|
|
327
|
+
self.end
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
fn begin_c(&self) -> usize {
|
|
331
|
+
self.begin_c
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
fn end_c(&self) -> usize {
|
|
335
|
+
self.end_c
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
fn synonym_group_ids(&self) -> RArray {
|
|
339
|
+
let ruby = Ruby::get().unwrap();
|
|
340
|
+
let ary = ruby.ary_new_capa(self.synonym_group_ids.len());
|
|
341
|
+
for &id in &self.synonym_group_ids {
|
|
342
|
+
let _ = ary.push(id);
|
|
343
|
+
}
|
|
344
|
+
ary
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
fn total_cost(&self) -> i32 {
|
|
348
|
+
self.total_cost
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
fn is_system(&self) -> bool {
|
|
352
|
+
self.is_system
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
fn is_user(&self) -> bool {
|
|
356
|
+
self.is_user
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
fn split(&self, mode_str: Option<String>) -> Result<RArray, Error> {
|
|
360
|
+
let target_mode = parse_mode(mode_str.as_deref());
|
|
361
|
+
let ruby = Ruby::get().unwrap();
|
|
362
|
+
|
|
363
|
+
// Re-tokenize the surface text with the target mode
|
|
364
|
+
let tokenizer = StatelessTokenizer::new(&*self.dict);
|
|
365
|
+
let morphemes = tokenizer
|
|
366
|
+
.tokenize(&self.surface, target_mode, self.debug)
|
|
367
|
+
.map_err(sudachi_error)?;
|
|
368
|
+
|
|
369
|
+
let ary = ruby.ary_new_capa(morphemes.len());
|
|
370
|
+
for i in 0..morphemes.len() {
|
|
371
|
+
let m = morphemes.get(i);
|
|
372
|
+
let wid = m.word_id();
|
|
373
|
+
let rb_m = RbMorpheme {
|
|
374
|
+
surface: m.surface().to_string(),
|
|
375
|
+
pos: m.part_of_speech().iter().map(|s| s.to_string()).collect(),
|
|
376
|
+
pos_id: m.part_of_speech_id(),
|
|
377
|
+
dictionary_form: m.dictionary_form().to_string(),
|
|
378
|
+
normalized_form: m.normalized_form().to_string(),
|
|
379
|
+
reading_form: m.reading_form().to_string(),
|
|
380
|
+
is_oov: m.is_oov(),
|
|
381
|
+
dictionary_id: m.dictionary_id(),
|
|
382
|
+
word_id_raw: wid.as_raw(),
|
|
383
|
+
is_system: wid.is_system(),
|
|
384
|
+
is_user: wid.is_user(),
|
|
385
|
+
begin: m.begin(),
|
|
386
|
+
end: m.end(),
|
|
387
|
+
begin_c: m.begin_c(),
|
|
388
|
+
end_c: m.end_c(),
|
|
389
|
+
synonym_group_ids: m.synonym_group_ids().to_vec(),
|
|
390
|
+
total_cost: m.total_cost(),
|
|
391
|
+
dict: self.dict.clone(),
|
|
392
|
+
debug: self.debug,
|
|
393
|
+
};
|
|
394
|
+
ary.push(rb_m)?;
|
|
395
|
+
}
|
|
396
|
+
Ok(ary)
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
fn inspect(&self) -> String {
|
|
400
|
+
format!(
|
|
401
|
+
"#<Kabosu::Morpheme surface=\"{}\" pos=[{}] reading=\"{}\" {}..{}>",
|
|
402
|
+
self.surface,
|
|
403
|
+
self.pos.join(", "),
|
|
404
|
+
self.reading_form,
|
|
405
|
+
self.begin_c,
|
|
406
|
+
self.end_c,
|
|
407
|
+
)
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
fn to_s(&self) -> &str {
|
|
411
|
+
&self.surface
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
// ---------- Helpers ----------
|
|
416
|
+
|
|
417
|
+
fn parse_mode(mode: Option<&str>) -> Mode {
|
|
418
|
+
match mode {
|
|
419
|
+
Some("A") | Some("a") => Mode::A,
|
|
420
|
+
Some("B") | Some("b") => Mode::B,
|
|
421
|
+
_ => Mode::C,
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// ---------- Init ----------
|
|
426
|
+
|
|
427
|
+
#[magnus::init]
|
|
428
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
429
|
+
let module = ruby.define_module("Kabosu")?;
|
|
430
|
+
|
|
431
|
+
// Kabosu::Dictionary
|
|
432
|
+
let dict_class = module.define_class("Dictionary", ruby.class_object())?;
|
|
433
|
+
dict_class.define_singleton_method("new", function!(RbDictionary::new, -1))?;
|
|
434
|
+
dict_class.define_method("create", method!(RbDictionary::create, 1))?;
|
|
435
|
+
dict_class.define_method("create_stateful", method!(RbDictionary::create_stateful, 1))?;
|
|
436
|
+
|
|
437
|
+
// Kabosu::Tokenizer
|
|
438
|
+
let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
|
|
439
|
+
tok_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
|
|
440
|
+
tok_class.define_method("mode", method!(RbTokenizer::mode, 0))?;
|
|
441
|
+
tok_class.define_method("debug=", method!(RbTokenizer::set_debug, 1))?;
|
|
442
|
+
tok_class.define_method("debug?", method!(RbTokenizer::is_debug, 0))?;
|
|
443
|
+
tok_class.define_method("internal_cost", method!(RbTokenizer::internal_cost, 0))?;
|
|
444
|
+
tok_class.define_method(
|
|
445
|
+
"tokenize_sentences",
|
|
446
|
+
method!(RbTokenizer::tokenize_sentences, 1),
|
|
447
|
+
)?;
|
|
448
|
+
|
|
449
|
+
// Kabosu::StatefulTokenizer
|
|
450
|
+
let stok_class = module.define_class("StatefulTokenizer", ruby.class_object())?;
|
|
451
|
+
stok_class.define_method("tokenize", method!(RbStatefulTokenizer::tokenize, 1))?;
|
|
452
|
+
stok_class.define_method("mode", method!(RbStatefulTokenizer::mode, 0))?;
|
|
453
|
+
stok_class.define_method("debug=", method!(RbStatefulTokenizer::set_debug, 1))?;
|
|
454
|
+
stok_class.define_method("debug?", method!(RbStatefulTokenizer::is_debug, 0))?;
|
|
455
|
+
stok_class.define_method("internal_cost", method!(RbStatefulTokenizer::internal_cost, 0))?;
|
|
456
|
+
stok_class.define_method(
|
|
457
|
+
"tokenize_sentences",
|
|
458
|
+
method!(RbStatefulTokenizer::tokenize_sentences, 1),
|
|
459
|
+
)?;
|
|
460
|
+
|
|
461
|
+
// Kabosu::Morpheme
|
|
462
|
+
let morph_class = module.define_class("Morpheme", ruby.class_object())?;
|
|
463
|
+
morph_class.define_method("surface", method!(RbMorpheme::surface, 0))?;
|
|
464
|
+
morph_class.define_method("part_of_speech", method!(RbMorpheme::part_of_speech, 0))?;
|
|
465
|
+
morph_class.define_method("part_of_speech_id", method!(RbMorpheme::part_of_speech_id, 0))?;
|
|
466
|
+
morph_class.define_method("dictionary_form", method!(RbMorpheme::dictionary_form, 0))?;
|
|
467
|
+
morph_class.define_method("normalized_form", method!(RbMorpheme::normalized_form, 0))?;
|
|
468
|
+
morph_class.define_method("reading_form", method!(RbMorpheme::reading_form, 0))?;
|
|
469
|
+
morph_class.define_method("oov?", method!(RbMorpheme::is_oov, 0))?;
|
|
470
|
+
morph_class.define_method("dictionary_id", method!(RbMorpheme::dictionary_id, 0))?;
|
|
471
|
+
morph_class.define_method("word_id", method!(RbMorpheme::word_id, 0))?;
|
|
472
|
+
morph_class.define_method("begin", method!(RbMorpheme::begin, 0))?;
|
|
473
|
+
morph_class.define_method("end", method!(RbMorpheme::end, 0))?;
|
|
474
|
+
morph_class.define_method("begin_c", method!(RbMorpheme::begin_c, 0))?;
|
|
475
|
+
morph_class.define_method("end_c", method!(RbMorpheme::end_c, 0))?;
|
|
476
|
+
morph_class.define_method("synonym_group_ids", method!(RbMorpheme::synonym_group_ids, 0))?;
|
|
477
|
+
morph_class.define_method("total_cost", method!(RbMorpheme::total_cost, 0))?;
|
|
478
|
+
morph_class.define_method("system?", method!(RbMorpheme::is_system, 0))?;
|
|
479
|
+
morph_class.define_method("user?", method!(RbMorpheme::is_user, 0))?;
|
|
480
|
+
morph_class.define_method("split", method!(RbMorpheme::split, 1))?;
|
|
481
|
+
morph_class.define_method("inspect", method!(RbMorpheme::inspect, 0))?;
|
|
482
|
+
morph_class.define_method("to_s", method!(RbMorpheme::to_s, 0))?;
|
|
483
|
+
|
|
484
|
+
// Kabosu::MODE_A, MODE_B, MODE_C constants
|
|
485
|
+
module.const_set("MODE_A", ruby.str_new("A"))?;
|
|
486
|
+
module.const_set("MODE_B", ruby.str_new("B"))?;
|
|
487
|
+
module.const_set("MODE_C", ruby.str_new("C"))?;
|
|
488
|
+
|
|
489
|
+
Ok(())
|
|
490
|
+
}
|