kabosu 0.6.10.1 → 0.6.10.2.dev.20260531.b95a56a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/kabosu/src/grouping.rs +133 -0
- data/ext/kabosu/src/lib.rs +2 -0
- data/ext/kabosu/src/morpheme.rs +36 -15
- data/ext/kabosu/src/token_batch.rs +7 -0
- data/lib/kabosu/morpheme_list.rb +48 -0
- data/lib/kabosu/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3ba5a31baf20e7e4958a21f14d0b9194a9b83292b2823192bfe988c7bcb3b121
|
|
4
|
+
data.tar.gz: efbd5e6664154257c9ce993d85f2c06e11376f70e444102eb75435210f64def7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d6bfc7e8d82719596d5b25057983136e2ba4f9f6548021c964595d2d3f69bfdee56faf77c2ee67186bad0a4e3f6a08b6f9c13743ebfaaf81f415fd65dbd70b10
|
|
7
|
+
data.tar.gz: '0929bb3dcd92ddbf4bb3d6e7ba7969d9661ad0421aef553a80b2da9560f438401a430ef0f448bf6f7db30464cdee6f0b57444fce9816c2a2a0abda8b718a1175'
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
use magnus::{Error, RArray, Ruby};
|
|
2
|
+
use std::sync::Arc;
|
|
3
|
+
use sudachi::dic::dictionary::JapaneseDictionary;
|
|
4
|
+
|
|
5
|
+
use crate::morpheme::{rb_morpheme_from_data, MorphemeData};
|
|
6
|
+
|
|
7
|
+
/// Group raw `MorphemeData` into jpdb-style chips.
|
|
8
|
+
pub(crate) fn group_morphemes_rust(
|
|
9
|
+
morphemes: &[MorphemeData],
|
|
10
|
+
dict: &Arc<JapaneseDictionary>,
|
|
11
|
+
debug: bool,
|
|
12
|
+
) -> Result<RArray, Error> {
|
|
13
|
+
let ruby = Ruby::get().unwrap();
|
|
14
|
+
let mut groups: Vec<Vec<MorphemeData>> = Vec::with_capacity(morphemes.len());
|
|
15
|
+
|
|
16
|
+
for m in morphemes {
|
|
17
|
+
if let Some(last_group) = groups.last_mut() {
|
|
18
|
+
let head = &last_group[0];
|
|
19
|
+
let prev = last_group.last().unwrap();
|
|
20
|
+
if is_content_word(head.pos_id, dict) && extends_group(m, prev, dict) {
|
|
21
|
+
last_group.push(m.clone());
|
|
22
|
+
continue;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
groups.push(vec![m.clone()]);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
let result = ruby.ary_new();
|
|
29
|
+
for group in groups {
|
|
30
|
+
let group_ary = ruby.ary_new();
|
|
31
|
+
for data in group {
|
|
32
|
+
group_ary.push(rb_morpheme_from_data(data, dict.clone(), debug))?;
|
|
33
|
+
}
|
|
34
|
+
result.push(group_ary)?;
|
|
35
|
+
}
|
|
36
|
+
Ok(result)
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// POS helpers
|
|
40
|
+
|
|
41
|
+
fn is_content_word(pos_id: u16, dict: &JapaneseDictionary) -> bool {
|
|
42
|
+
match dict.grammar().pos_components(pos_id).first().map(|s| s.as_str()) {
|
|
43
|
+
Some("助詞") | Some("助動詞") | Some("補助記号") | Some("記号") | Some("空白") => false,
|
|
44
|
+
_ => true,
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
fn extends_group(
|
|
49
|
+
m: &MorphemeData,
|
|
50
|
+
prev: &MorphemeData,
|
|
51
|
+
dict: &JapaneseDictionary,
|
|
52
|
+
) -> bool {
|
|
53
|
+
let comps = dict.grammar().pos_components(m.pos_id);
|
|
54
|
+
let pos0 = comps.first().map(|s| s.as_str());
|
|
55
|
+
let pos1 = comps.get(1).map(|s| s.as_str());
|
|
56
|
+
|
|
57
|
+
match pos0 {
|
|
58
|
+
Some("助動詞") => true,
|
|
59
|
+
Some("助詞") => {
|
|
60
|
+
if is_clause_boundary(m.surface.as_str(), m.pos_id, dict) {
|
|
61
|
+
return false;
|
|
62
|
+
}
|
|
63
|
+
if pos1 == Some("接続助詞") {
|
|
64
|
+
return true;
|
|
65
|
+
}
|
|
66
|
+
// 副助詞 clings to preceding verb/adjective (e.g. たり/だり)
|
|
67
|
+
if pos1 == Some("副助詞") && is_verb_adj_adv(prev.pos_id, dict) {
|
|
68
|
+
return true;
|
|
69
|
+
}
|
|
70
|
+
false
|
|
71
|
+
}
|
|
72
|
+
Some("動詞") => {
|
|
73
|
+
if pos1 != Some("非自立可能") {
|
|
74
|
+
return false;
|
|
75
|
+
}
|
|
76
|
+
// te-form auxiliary chain: て/で + いる/ある/くる/etc.
|
|
77
|
+
let prev_pos0 = dict.grammar().pos_components(prev.pos_id).first().map(|s| s.as_str());
|
|
78
|
+
if prev_pos0 == Some("助詞")
|
|
79
|
+
&& (prev.surface == "て" || prev.surface == "で")
|
|
80
|
+
{
|
|
81
|
+
return true;
|
|
82
|
+
}
|
|
83
|
+
// compound verb (V+V) intentionally skipped — caller handles DB lookup
|
|
84
|
+
false
|
|
85
|
+
}
|
|
86
|
+
_ => false,
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
fn is_clause_boundary(surface: &str, pos_id: u16, dict: &JapaneseDictionary) -> bool {
|
|
91
|
+
let comps = dict.grammar().pos_components(pos_id);
|
|
92
|
+
let pos0 = comps.first().map(|s| s.as_str());
|
|
93
|
+
let pos1 = comps.get(1).map(|s| s.as_str());
|
|
94
|
+
|
|
95
|
+
if pos0 == Some("助詞") {
|
|
96
|
+
if is_clause_boundary_particle(surface) {
|
|
97
|
+
return true;
|
|
98
|
+
}
|
|
99
|
+
// contrastive が (接続助詞) is a boundary, unlike subject が (格助詞)
|
|
100
|
+
if pos1 == Some("接続助詞") && surface == "が" {
|
|
101
|
+
return true;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
false
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
/// Returns `true` when `surface` exactly equals one of the particle surfaces
/// treated as clause boundaries regardless of their POS sub-category.
fn is_clause_boundary_particle(surface: &str) -> bool {
    const BOUNDARY_PARTICLES: [&str; 15] = [
        "ながら",
        "たら",
        "ば",
        "と",
        "のに",
        "から",
        "ので",
        "けれど",
        "けど",
        "つつ",
        "なり",
        "や",
        "か",
        "かどうか",
        "とも",
    ];

    BOUNDARY_PARTICLES.contains(&surface)
}
|
|
127
|
+
|
|
128
|
+
fn is_verb_adj_adv(pos_id: u16, dict: &JapaneseDictionary) -> bool {
|
|
129
|
+
matches!(
|
|
130
|
+
dict.grammar().pos_components(pos_id).first().map(|s| s.as_str()),
|
|
131
|
+
Some("動詞") | Some("形容詞") | Some("形状詞")
|
|
132
|
+
)
|
|
133
|
+
}
|
data/ext/kabosu/src/lib.rs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
mod dictionary;
|
|
2
2
|
mod errors;
|
|
3
|
+
mod grouping;
|
|
3
4
|
mod morpheme;
|
|
4
5
|
mod nogvl;
|
|
5
6
|
mod parsing;
|
|
@@ -38,6 +39,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
38
39
|
batch_class.define_method("internal_cost", method!(RbTokenBatch::internal_cost, 0))?;
|
|
39
40
|
batch_class.define_method("morpheme_at", method!(RbTokenBatch::morpheme_at, 1))?;
|
|
40
41
|
batch_class.define_method("surfaces", method!(RbTokenBatch::surfaces, 0))?;
|
|
42
|
+
batch_class.define_method("group_morphemes", method!(RbTokenBatch::group_morphemes, 0))?;
|
|
41
43
|
|
|
42
44
|
// Kabosu::Tokenizer
|
|
43
45
|
let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
|
data/ext/kabosu/src/morpheme.rs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
use magnus::
|
|
1
|
+
use magnus::value::ReprValue;
|
|
2
|
+
use magnus::{gc, Error, RArray, RString, Ruby};
|
|
2
3
|
use std::cell::OnceCell;
|
|
3
|
-
use std::
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
use std::sync::{Arc, Mutex, OnceLock};
|
|
4
6
|
use sudachi::analysis::morpheme::Morpheme as SudachiMorpheme;
|
|
5
7
|
use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
|
|
6
8
|
use sudachi::analysis::Mode;
|
|
@@ -57,6 +59,12 @@ where
|
|
|
57
59
|
}
|
|
58
60
|
}
|
|
59
61
|
|
|
62
|
+
// Newtype around a Ruby array handle so it can be stored as a value in the
// static `POS_CACHE` map declared below.
#[derive(Clone, Copy)]
|
|
63
|
+
struct CachedRArray(RArray);
|
|
64
|
+
// SAFETY: NOTE(review) — no invariant is stated in the code. This impl is what
// lets the handle live inside a `static Mutex`; presumably it is sound only
// because the array is touched exclusively while holding Ruby's GVL — confirm.
unsafe impl Send for CachedRArray {}
|
|
65
|
+
|
|
66
|
+
// Lazily initialised, process-wide cache of frozen POS arrays. `part_of_speech`
// keys it by `(address of the dictionary Arc as usize, pos_id)`.
static POS_CACHE: OnceLock<Mutex<HashMap<(usize, u16), CachedRArray>>> = OnceLock::new();
|
|
67
|
+
|
|
60
68
|
pub(crate) fn rb_morpheme_from_data(
|
|
61
69
|
data: MorphemeData,
|
|
62
70
|
dict: Arc<JapaneseDictionary>,
|
|
@@ -66,7 +74,6 @@ pub(crate) fn rb_morpheme_from_data(
|
|
|
66
74
|
data,
|
|
67
75
|
dict,
|
|
68
76
|
debug,
|
|
69
|
-
pos: OnceCell::new(),
|
|
70
77
|
word_fields: OnceCell::new(),
|
|
71
78
|
}
|
|
72
79
|
}
|
|
@@ -85,7 +92,6 @@ pub(crate) struct RbMorpheme {
|
|
|
85
92
|
data: MorphemeData,
|
|
86
93
|
dict: Arc<JapaneseDictionary>,
|
|
87
94
|
debug: bool,
|
|
88
|
-
pos: OnceCell<Vec<String>>,
|
|
89
95
|
word_fields: OnceCell<LazyWordFields>,
|
|
90
96
|
}
|
|
91
97
|
|
|
@@ -243,17 +249,32 @@ impl RbMorpheme {
|
|
|
243
249
|
&self.data.surface
|
|
244
250
|
}
|
|
245
251
|
|
|
246
|
-
pub(crate) fn part_of_speech(&self) ->
|
|
247
|
-
self.
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
252
|
+
pub(crate) fn part_of_speech(&self) -> Result<RArray, Error> {
|
|
253
|
+
let dict_ptr = Arc::as_ptr(&self.dict) as usize;
|
|
254
|
+
let pos_id = self.data.pos_id;
|
|
255
|
+
|
|
256
|
+
{
|
|
257
|
+
let cache = POS_CACHE.get_or_init(|| Mutex::new(HashMap::new())).lock().unwrap();
|
|
258
|
+
if let Some(&cached) = cache.get(&(dict_ptr, pos_id)) {
|
|
259
|
+
return Ok(cached.0);
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
let ruby = Ruby::get().unwrap();
|
|
264
|
+
let ary = ruby.ary_new();
|
|
265
|
+
for s in self.dict.grammar().pos_components(pos_id) {
|
|
266
|
+
let rstr: RString = ruby.str_new(s);
|
|
267
|
+
rstr.as_value().freeze();
|
|
268
|
+
ary.push(rstr)?;
|
|
269
|
+
}
|
|
270
|
+
ary.as_value().freeze();
|
|
271
|
+
// Pin permanently: POS combinations are bounded (~hundreds) and frozen
|
|
272
|
+
// arrays are tiny. The cache lives for the process lifetime.
|
|
273
|
+
gc::register_mark_object(ary);
|
|
274
|
+
|
|
275
|
+
POS_CACHE.get_or_init(|| Mutex::new(HashMap::new())).lock().unwrap().insert((dict_ptr, pos_id), CachedRArray(ary));
|
|
276
|
+
|
|
277
|
+
Ok(ary)
|
|
257
278
|
}
|
|
258
279
|
|
|
259
280
|
pub(crate) fn part_of_speech_id(&self) -> u16 {
|
|
@@ -2,6 +2,7 @@ use magnus::{Error, RArray, Ruby};
|
|
|
2
2
|
use std::sync::Arc;
|
|
3
3
|
use sudachi::dic::dictionary::JapaneseDictionary;
|
|
4
4
|
|
|
5
|
+
use crate::grouping::group_morphemes_rust;
|
|
5
6
|
use crate::morpheme::{rb_morpheme_from_data, MorphemeData, RbMorpheme};
|
|
6
7
|
|
|
7
8
|
#[magnus::wrap(class = "Kabosu::TokenBatch")]
|
|
@@ -50,4 +51,10 @@ impl RbTokenBatch {
|
|
|
50
51
|
}
|
|
51
52
|
Ok(ary)
|
|
52
53
|
}
|
|
54
|
+
|
|
55
|
+
/// jpdb-style grouping performed natively in Rust.
|
|
56
|
+
/// Returns an Array-of-Array of Kabosu::Morpheme.
|
|
57
|
+
pub(crate) fn group_morphemes(&self) -> Result<RArray, Error> {
|
|
58
|
+
group_morphemes_rust(&self.morphemes, &self.dict, self.debug)
|
|
59
|
+
}
|
|
53
60
|
}
|
data/lib/kabosu/morpheme_list.rb
CHANGED
|
@@ -92,6 +92,54 @@ module Kabosu
|
|
|
92
92
|
surfaces.join
|
|
93
93
|
end
|
|
94
94
|
|
|
95
|
+
# Groups the morphemes of this list jpdb-style and returns an Array of
# Arrays of morphemes.
#
# When @source responds to +group_morphemes+ the work is delegated to it.
# Otherwise the list is walked in Ruby: a morpheme joins the current group
# when that group starts with a content word and +extends_group?+ accepts it
# relative to the group's last member; if not, it starts a new group.
def group_morphemes
  backing = @source
  return backing.group_morphemes if backing&.respond_to?(:group_morphemes)

  chunks = []
  each do |morpheme|
    current = chunks.last
    joins = current && content_word?(current.first) && extends_group?(morpheme, current.last)
    if joins
      current << morpheme
    else
      chunks << [morpheme]
    end
  end
  chunks
end
|
|
114
|
+
|
|
115
|
+
private
|
|
116
|
+
|
|
117
|
+
# True unless the morpheme's first POS component is a particle (助詞),
# auxiliary verb (助動詞), symbol (補助記号 / 記号) or whitespace (空白).
def content_word?(morpheme)
  case morpheme.part_of_speech.first
  when "助詞", "助動詞", "補助記号", "記号", "空白"
    false
  else
    true
  end
end
|
|
120
|
+
|
|
121
|
+
# Decides whether +morpheme+ attaches to the group whose last member is +prev+.
#
# * 助動詞 always attaches.
# * 助詞 never attaches when it is a clause boundary; otherwise 接続助詞
#   attaches, and 副助詞 attaches only after a 動詞 / 形容詞 / 形状詞.
# * 動詞 / 非自立可能 attaches only after a 助詞 whose surface is て or で.
# * Anything else does not attach.
#
# The result is only guaranteed to be truthy or falsy (it may be nil when
# +prev+ is nil).
def extends_group?(morpheme, prev = nil)
  tags = morpheme.part_of_speech

  case tags[0]
  when "助動詞"
    true
  when "助詞"
    if clause_boundary?(morpheme)
      false
    elsif tags[1] == "接続助詞"
      true
    elsif tags[1] == "副助詞" && prev
      %w[動詞 形容詞 形状詞].include?(prev.part_of_speech[0])
    else
      false
    end
  when "動詞"
    tags[1] == "非自立可能" &&
      prev && prev.part_of_speech[0] == "助詞" && %w[て で].include?(prev.surface)
  else
    false
  end
end
|
|
131
|
+
|
|
132
|
+
# True when +morpheme+ is a particle (助詞) that closes a clause: either its
# surface is one of the listed boundary particles, or it is the conjunctive
# が (接続助詞). Returns false for nil and for non-particles.
def clause_boundary?(morpheme)
  return false if !morpheme

  tags = morpheme.part_of_speech
  return false unless tags[0] == "助詞"

  surface = morpheme.surface
  boundary_surfaces = %w[ながら たら ば と のに から ので けれど けど つつ なり や か かどうか とも]
  return true if boundary_surfaces.include?(surface)

  tags[1] == "接続助詞" && surface == "が"
end
|
|
140
|
+
|
|
141
|
+
public
|
|
142
|
+
|
|
95
143
|
# Filter morphemes by POS. Accepts a PosMatcher or an array pattern.
|
|
96
144
|
# Returns a new MorphemeList with only matching morphemes.
|
|
97
145
|
#
|
data/lib/kabosu/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kabosu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.10.
|
|
4
|
+
version: 0.6.10.2.dev.20260531.b95a56a
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- davafons
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -110,6 +110,7 @@ files:
|
|
|
110
110
|
- ext/kabosu/extconf.rb
|
|
111
111
|
- ext/kabosu/src/dictionary.rs
|
|
112
112
|
- ext/kabosu/src/errors.rs
|
|
113
|
+
- ext/kabosu/src/grouping.rs
|
|
113
114
|
- ext/kabosu/src/lib.rs
|
|
114
115
|
- ext/kabosu/src/morpheme.rs
|
|
115
116
|
- ext/kabosu/src/nogvl.rs
|