kabosu 0.6.10.dev.20260225.c3c6711 → 0.6.10.dev.20260226.98055fb

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d02b7f222732b36afe4c3da668cc8aa347a00e53d57dc59905dbb40f60cb937
4
- data.tar.gz: f105c63fda73d071b4e659891afa868656cb299ee7e04b6c53e0595a34634751
3
+ metadata.gz: 88012b4a49377c642ff1d694ae3d6da5e6400e2b4bf494c34ee99e1467b1be0c
4
+ data.tar.gz: 4c7324c24a1880ba62e006d8f5f3cea4610cc9e574122a4ece812b2acc649d68
5
5
  SHA512:
6
- metadata.gz: e776b3e802b8a6d6a9439367010b81580cb36d59069406d66ee2746ed059b089004e08d62d962b89d787dcfbe30463ef6ac5fba5128f469f944ef6f662e4fe37
7
- data.tar.gz: a26907ac57248903d6531d62b35888942dbdd15cdb488a64fa6e782c18ffc7419b0ad52153c0e1656242c7fe9d671d6d80a8ea9fcc9fa8c7059d3ec24ec846ff
6
+ metadata.gz: 7bb1f40e57f79f5aab99bb3867db3ddaa0e6b0ef34f12b5d479dc961a7a6e5ea7dbbf79db1ac33e95b155843d722e45452716f1cca954f9a42346a9bfbc41b8c
7
+ data.tar.gz: 7b5de7868de3e7725a3c777a5d3374aa8b452c2611abcb0ffecf22cc74d5a1321055f85db690ca42a78fe0d114530fb354cdec2c1d60e47c00e042caa7cdef8b
data/README.md CHANGED
@@ -54,6 +54,10 @@ morpheme.begin_c # => 0 - start character offset
54
54
  morpheme.end_c # => 3 - end character offset
55
55
  morpheme.system? # => true - from system dictionary?
56
56
  morpheme.user? # => false - from user dictionary?
57
+
58
+ # Split text into natural Japanese sentence boundaries
59
+ Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。")
60
+ # => ["東京都に住んでいる。", "大阪も好きだ。"]
57
61
  ```
58
62
 
59
63
  ## Installation
@@ -109,7 +113,6 @@ tok_c.tokenize("東京都").surfaces # => ["東京都"]
109
113
  ```
110
114
 
111
115
  Modes are symbols only (`:a`, `:b`, `:c` or `Kabosu::MODE_A/B/C`).
112
- Invalid modes now raise `ArgumentError` (for example, `"A"`).
113
116
 
114
117
  ## Advanced Use Cases
115
118
 
@@ -135,12 +138,6 @@ dict.lookup("東京都").surfaces
135
138
  m = tokenizer.tokenize("東京都").first
136
139
  m.split(mode: :a).surfaces
137
140
 
138
- # Bulk extractors
139
- tokenizer.tokenize_surfaces("東京都に住んでいる")
140
- tokenizer.tokenize_readings("東京都に住んでいる")
141
- tokenizer.tokenize_dictionary_forms("東京都に住んでいる")
142
- tokenizer.tokenize_normalized_forms("東京都に住んでいる")
143
-
144
141
  # Sentence splitting
145
142
  Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。", ranges: true)
146
143
  Kabosu.split_sentences("長い文...", limit: 12, with_checker: true)
@@ -203,5 +200,6 @@ bundle exec rake kabosu:install # Install Sudachi dictionary
203
200
 
204
201
  bundle exec rake compile # Build the native extension
205
202
  bundle exec rake test # Run tests
203
+
206
204
  bench/start # Run benchmarks
207
205
  ```
@@ -64,6 +64,9 @@ impl RbDictionary {
64
64
 
65
65
  let data = MorphemeData {
66
66
  surface: surface_slice.to_string(),
67
+ dictionary_form: info.dictionary_form().to_string(),
68
+ normalized_form: info.normalized_form().to_string(),
69
+ reading_form: info.reading_form().to_string(),
67
70
  pos_id: info.pos_id(),
68
71
  word_id_raw: entry.word_id.as_raw(),
69
72
  is_oov: entry.word_id.is_oov(),
@@ -42,16 +42,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
42
42
  // Kabosu::Tokenizer
43
43
  let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
44
44
  tok_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
45
- tok_class.define_method("tokenize_surfaces", method!(RbTokenizer::tokenize_surfaces, 1))?;
46
- tok_class.define_method("tokenize_readings", method!(RbTokenizer::tokenize_readings, 1))?;
47
- tok_class.define_method(
48
- "tokenize_dictionary_forms",
49
- method!(RbTokenizer::tokenize_dictionary_forms, 1),
50
- )?;
51
- tok_class.define_method(
52
- "tokenize_normalized_forms",
53
- method!(RbTokenizer::tokenize_normalized_forms, 1),
54
- )?;
55
45
  tok_class.define_method("mode", method!(RbTokenizer::mode, 0))?;
56
46
  tok_class.define_method("fields", method!(RbTokenizer::fields, 0))?;
57
47
  tok_class.define_method("debug?", method!(RbTokenizer::is_debug, 0))?;
@@ -13,6 +13,9 @@ use crate::parsing::parse_mode;
13
13
  #[derive(Clone)]
14
14
  pub(crate) struct MorphemeData {
15
15
  pub(crate) surface: String,
16
+ pub(crate) dictionary_form: String,
17
+ pub(crate) normalized_form: String,
18
+ pub(crate) reading_form: String,
16
19
  pub(crate) pos_id: u16,
17
20
  pub(crate) word_id_raw: u32,
18
21
  pub(crate) is_oov: bool,
@@ -37,6 +40,9 @@ where
37
40
 
38
41
  MorphemeData {
39
42
  surface,
43
+ dictionary_form: m.dictionary_form().to_string(),
44
+ normalized_form: m.normalized_form().to_string(),
45
+ reading_form: m.reading_form().to_string(),
40
46
  pos_id: m.part_of_speech_id(),
41
47
  word_id_raw: m.word_id().as_raw(),
42
48
  is_oov: m.is_oov(),
@@ -84,9 +90,6 @@ pub(crate) struct RbMorpheme {
84
90
  }
85
91
 
86
92
  struct LazyWordFields {
87
- dictionary_form: String,
88
- normalized_form: String,
89
- reading_form: String,
90
93
  synonym_group_ids: Vec<u32>,
91
94
  dictionary_form_word_id: i32,
92
95
  head_word_length: usize,
@@ -96,14 +99,31 @@ struct LazyWordFields {
96
99
  }
97
100
 
98
101
  impl RbMorpheme {
102
+ fn fallback_word_fields(&self) -> LazyWordFields {
103
+ LazyWordFields {
104
+ synonym_group_ids: Vec::new(),
105
+ dictionary_form_word_id: -1,
106
+ head_word_length: self.data.surface.chars().count(),
107
+ a_unit_split: Vec::new(),
108
+ b_unit_split: Vec::new(),
109
+ word_structure: Vec::new(),
110
+ }
111
+ }
112
+
99
113
  fn resolve_word_fields(&self) -> &LazyWordFields {
100
114
  self.word_fields.get_or_init(|| {
115
+ if self.data.is_oov || self.data.dictionary_id < 0 {
116
+ return self.fallback_word_fields();
117
+ }
118
+
101
119
  let wid = WordId::from_raw(self.data.word_id_raw);
102
- match self.dict.lexicon().get_word_info(wid) {
103
- Ok(info) => LazyWordFields {
104
- dictionary_form: info.dictionary_form().to_string(),
105
- normalized_form: info.normalized_form().to_string(),
106
- reading_form: info.reading_form().to_string(),
120
+
121
+ let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
122
+ self.dict.lexicon().get_word_info(wid)
123
+ }));
124
+
125
+ match info_result {
126
+ Ok(Ok(info)) => LazyWordFields {
107
127
  synonym_group_ids: info.synonym_group_ids().to_vec(),
108
128
  dictionary_form_word_id: info.dictionary_form_word_id(),
109
129
  head_word_length: info.head_word_length(),
@@ -111,28 +131,25 @@ impl RbMorpheme {
111
131
  b_unit_split: info.b_unit_split().iter().map(WordId::as_raw).collect(),
112
132
  word_structure: info.word_structure().iter().map(WordId::as_raw).collect(),
113
133
  },
114
- Err(_) => LazyWordFields {
115
- dictionary_form: self.data.surface.clone(),
116
- normalized_form: self.data.surface.clone(),
117
- reading_form: self.data.surface.clone(),
118
- synonym_group_ids: Vec::new(),
119
- dictionary_form_word_id: -1,
120
- head_word_length: self.data.surface.chars().count(),
121
- a_unit_split: Vec::new(),
122
- b_unit_split: Vec::new(),
123
- word_structure: Vec::new(),
124
- },
134
+ _ => self.fallback_word_fields(),
125
135
  }
126
136
  })
127
137
  }
128
138
 
129
139
  fn split_ids_from_lexicon(&self, mode: Mode) -> Result<Vec<u32>, Error> {
130
- let wid = WordId::from_raw(self.data.word_id_raw);
131
- if wid.is_oov() {
140
+ if self.data.is_oov || self.data.dictionary_id < 0 {
132
141
  return Ok(Vec::new());
133
142
  }
143
+ let wid = WordId::from_raw(self.data.word_id_raw);
134
144
 
135
- let info = self.dict.lexicon().get_word_info(wid).map_err(sudachi_error)?;
145
+ let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
146
+ self.dict.lexicon().get_word_info(wid)
147
+ }));
148
+ let info = match info_result {
149
+ Ok(Ok(info)) => info,
150
+ Ok(Err(e)) => return Err(sudachi_error(e)),
151
+ Err(_) => return Err(sudachi_error("panic while reading word info for split")),
152
+ };
136
153
  let ids = match mode {
137
154
  Mode::A => info.a_unit_split(),
138
155
  Mode::B => info.b_unit_split(),
@@ -174,7 +191,14 @@ impl RbMorpheme {
174
191
 
175
192
  for (i, &raw_wid) in split_ids.iter().enumerate() {
176
193
  let wid = WordId::from_raw(raw_wid);
177
- let info = self.dict.lexicon().get_word_info(wid).map_err(sudachi_error)?;
194
+ let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
195
+ self.dict.lexicon().get_word_info(wid)
196
+ }));
197
+ let info = match info_result {
198
+ Ok(Ok(info)) => info,
199
+ Ok(Err(e)) => return Err(sudachi_error(e)),
200
+ Err(_) => return Err(sudachi_error("panic while reading child word info for split")),
201
+ };
178
202
 
179
203
  // head_word_length is in codepoints; clamp to remaining characters.
180
204
  let mut span_chars = info.head_word_length();
@@ -190,6 +214,9 @@ impl RbMorpheme {
190
214
 
191
215
  let child = MorphemeData {
192
216
  surface: surface[start_byte..end_byte].to_string(),
217
+ dictionary_form: info.dictionary_form().to_string(),
218
+ normalized_form: info.normalized_form().to_string(),
219
+ reading_form: info.reading_form().to_string(),
193
220
  pos_id: info.pos_id(),
194
221
  word_id_raw: raw_wid,
195
222
  is_oov: wid.is_oov(),
@@ -230,15 +257,15 @@ impl RbMorpheme {
230
257
  }
231
258
 
232
259
  pub(crate) fn dictionary_form(&self) -> &str {
233
- &self.resolve_word_fields().dictionary_form
260
+ &self.data.dictionary_form
234
261
  }
235
262
 
236
263
  pub(crate) fn normalized_form(&self) -> &str {
237
- &self.resolve_word_fields().normalized_form
264
+ &self.data.normalized_form
238
265
  }
239
266
 
240
267
  pub(crate) fn reading_form(&self) -> &str {
241
- &self.resolve_word_fields().reading_form
268
+ &self.data.reading_form
242
269
  }
243
270
 
244
271
  pub(crate) fn is_oov(&self) -> bool {
@@ -8,7 +8,6 @@ use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
8
8
  use sudachi::analysis::Mode;
9
9
  use sudachi::dic::dictionary::JapaneseDictionary;
10
10
  use sudachi::dic::subset::InfoSubset;
11
- use sudachi::dic::word_id::WordId;
12
11
  use sudachi::prelude::MorphemeList;
13
12
 
14
13
  use crate::errors::sudachi_error;
@@ -168,20 +167,6 @@ impl RbTokenizer {
168
167
  }
169
168
  }
170
169
 
171
- fn collect_strings<F>(&self, text: String, mut project: F) -> Result<RArray, Error>
172
- where
173
- F: FnMut(&MorphemeData) -> String,
174
- {
175
- let ruby = Ruby::get().unwrap();
176
- let analyzed = self.analyze(text)?;
177
- let ary = ruby.ary_new_capa(analyzed.morphemes.len());
178
- for data in analyzed.morphemes {
179
- let projected = project(&data);
180
- ary.push(ruby.str_new(&projected))?;
181
- }
182
- Ok(ary)
183
- }
184
-
185
170
  pub(crate) fn tokenize(&self, text: String) -> Result<RbTokenBatch, Error> {
186
171
  let analyzed = self.analyze(text)?;
187
172
  Ok(RbTokenBatch::new(
@@ -192,40 +177,6 @@ impl RbTokenizer {
192
177
  ))
193
178
  }
194
179
 
195
- pub(crate) fn tokenize_surfaces(&self, text: String) -> Result<RArray, Error> {
196
- self.collect_strings(text, |data| data.surface.clone())
197
- }
198
-
199
- pub(crate) fn tokenize_readings(&self, text: String) -> Result<RArray, Error> {
200
- let lexicon = self.pool.dict.lexicon();
201
- self.collect_strings(text, |data| {
202
- lexicon
203
- .get_word_info(WordId::from_raw(data.word_id_raw))
204
- .map(|info| info.reading_form().to_string())
205
- .unwrap_or_else(|_| data.surface.clone())
206
- })
207
- }
208
-
209
- pub(crate) fn tokenize_dictionary_forms(&self, text: String) -> Result<RArray, Error> {
210
- let lexicon = self.pool.dict.lexicon();
211
- self.collect_strings(text, |data| {
212
- lexicon
213
- .get_word_info(WordId::from_raw(data.word_id_raw))
214
- .map(|info| info.dictionary_form().to_string())
215
- .unwrap_or_else(|_| data.surface.clone())
216
- })
217
- }
218
-
219
- pub(crate) fn tokenize_normalized_forms(&self, text: String) -> Result<RArray, Error> {
220
- let lexicon = self.pool.dict.lexicon();
221
- self.collect_strings(text, |data| {
222
- lexicon
223
- .get_word_info(WordId::from_raw(data.word_id_raw))
224
- .map(|info| info.normalized_form().to_string())
225
- .unwrap_or_else(|_| data.surface.clone())
226
- })
227
- }
228
-
229
180
  pub(crate) fn mode(&self) -> String {
230
181
  self.mode.to_string()
231
182
  }
@@ -1,3 +1,3 @@
1
1
  module Kabosu
2
- VERSION = "0.6.10.dev.20260225.c3c6711"
2
+ VERSION = "0.6.10.dev.20260226.98055fb"
3
3
  end
data/lib/kabosu.rb CHANGED
@@ -162,47 +162,6 @@ module Kabosu
162
162
  rescue RuntimeError => e
163
163
  raise TokenizationError.new(e.message), cause: e
164
164
  end
165
-
166
- alias_method :_tokenize_surfaces, :tokenize_surfaces
167
- alias_method :_tokenize_readings, :tokenize_readings
168
- alias_method :_tokenize_dictionary_forms, :tokenize_dictionary_forms
169
- alias_method :_tokenize_normalized_forms, :tokenize_normalized_forms
170
-
171
- def tokenize_surfaces(text)
172
- unless text.is_a?(String)
173
- raise ArgumentError, "text must be a String"
174
- end
175
- _tokenize_surfaces(text)
176
- rescue RuntimeError => e
177
- raise TokenizationError.new(e.message), cause: e
178
- end
179
-
180
- def tokenize_readings(text)
181
- unless text.is_a?(String)
182
- raise ArgumentError, "text must be a String"
183
- end
184
- _tokenize_readings(text)
185
- rescue RuntimeError => e
186
- raise TokenizationError.new(e.message), cause: e
187
- end
188
-
189
- def tokenize_dictionary_forms(text)
190
- unless text.is_a?(String)
191
- raise ArgumentError, "text must be a String"
192
- end
193
- _tokenize_dictionary_forms(text)
194
- rescue RuntimeError => e
195
- raise TokenizationError.new(e.message), cause: e
196
- end
197
-
198
- def tokenize_normalized_forms(text)
199
- unless text.is_a?(String)
200
- raise ArgumentError, "text must be a String"
201
- end
202
- _tokenize_normalized_forms(text)
203
- rescue RuntimeError => e
204
- raise TokenizationError.new(e.message), cause: e
205
- end
206
165
  end
207
166
 
208
167
  class Morpheme
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabosu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.10.dev.20260225.c3c6711
4
+ version: 0.6.10.dev.20260226.98055fb
5
5
  platform: ruby
6
6
  authors:
7
7
  - davafons
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-02-25 00:00:00.000000000 Z
11
+ date: 2026-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys