kabosu 0.6.10.1.dev.20260531.8cc8132 → 0.6.10.1.dev.20260531.a468755

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd256d0abf1e3e9070c8282ca30378858d06d73a2379586f2205158cdb3c7b40
4
- data.tar.gz: 52606c444dbc697639fd15318842b0d226288f002b3bc339ffb42b4094069952
3
+ metadata.gz: b4ab39bb0ce7aacedbd9640abb955beecd3e11f5ba1da56505efa7eccd45afb3
4
+ data.tar.gz: 69cabebc7ae242d7222408929f81613e1e89a431d25b79586593d81170eb83d9
5
5
  SHA512:
6
- metadata.gz: efeabfb7bf03ccda85fad30a0a12973aec341d8d1d5a775778552c13ebb24d0e8111e6aeede41e920701eef1cae7704ccdc760c322f1cd9a07f451dca54e4a73
7
- data.tar.gz: c51bcaf3687ab4ca9b03bc2637a360207a9afe32309787c4ea28da5e1e32e45fe2f836937a17261559be67c175b1176ff8dcf25ce818d4d367e1493aac6556d7
6
+ metadata.gz: 7eaa31344e593da582f9ce264b500ec4360b38294a93813002387f03ce153ee555e53fa49f824ead5fdbabe4efc4a66c12699b1721d4413dbb3e1b3c5d85c794
7
+ data.tar.gz: 04a175af4642fc5757eb1bf7df34e9bf875811a22df7263e7f4b226537017705f9777e774fa81520652c4a671ca79d65d2dfa210d3a76a11127c040def6bdc2d
@@ -1,6 +1,5 @@
1
1
  mod dictionary;
2
2
  mod errors;
3
- mod grouping;
4
3
  mod morpheme;
5
4
  mod nogvl;
6
5
  mod parsing;
@@ -39,7 +38,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
39
38
  batch_class.define_method("internal_cost", method!(RbTokenBatch::internal_cost, 0))?;
40
39
  batch_class.define_method("morpheme_at", method!(RbTokenBatch::morpheme_at, 1))?;
41
40
  batch_class.define_method("surfaces", method!(RbTokenBatch::surfaces, 0))?;
42
- batch_class.define_method("group_morphemes", method!(RbTokenBatch::group_morphemes, 0))?;
43
41
 
44
42
  // Kabosu::Tokenizer
45
43
  let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
@@ -2,7 +2,6 @@ use magnus::{Error, RArray, Ruby};
2
2
  use std::sync::Arc;
3
3
  use sudachi::dic::dictionary::JapaneseDictionary;
4
4
 
5
- use crate::grouping::group_morphemes_rust;
6
5
  use crate::morpheme::{rb_morpheme_from_data, MorphemeData, RbMorpheme};
7
6
 
8
7
  #[magnus::wrap(class = "Kabosu::TokenBatch")]
@@ -51,10 +50,4 @@ impl RbTokenBatch {
51
50
  }
52
51
  Ok(ary)
53
52
  }
54
-
55
- /// jpdb-style grouping performed natively in Rust.
56
- /// Returns an Array-of-Array of Kabosu::Morpheme.
57
- pub(crate) fn group_morphemes(&self) -> Result<RArray, Error> {
58
- group_morphemes_rust(&self.morphemes, &self.dict, self.debug)
59
- }
60
53
  }
@@ -92,54 +92,6 @@ module Kabosu
92
92
  surfaces.join
93
93
  end
94
94
 
95
- # jpdb-style grouping performed natively in Rust when backed by a lazy
96
- # source. Falls back to a Ruby implementation for already-materialized
97
- # lists so the method is always safe to call.
98
- def group_morphemes
99
- if @source&.respond_to?(:group_morphemes)
100
- return @source.group_morphemes
101
- end
102
-
103
- groups = []
104
- each do |m|
105
- last = groups.last
106
- if last && content_word?(last.first) && extends_group?(m, last.last)
107
- last << m
108
- else
109
- groups << [m]
110
- end
111
- end
112
- groups
113
- end
114
-
115
- private
116
-
117
- def content_word?(morpheme)
118
- !%w[助詞 助動詞 補助記号 記号 空白].include?(morpheme.part_of_speech.first)
119
- end
120
-
121
- def extends_group?(morpheme, prev = nil)
122
- pos = morpheme.part_of_speech
123
- pos1 = pos[0]
124
- pos1 == "助動詞" ||
125
- (pos1 == "助詞" && !clause_boundary?(morpheme) &&
126
- (pos[1] == "接続助詞" ||
127
- (pos[1] == "副助詞" && prev && %w[動詞 形容詞 形状詞].include?(prev.part_of_speech[0])))) ||
128
- (pos1 == "動詞" && pos[1] == "非自立可能" &&
129
- prev && prev.part_of_speech[0] == "助詞" && %w[て で].include?(prev.surface))
130
- end
131
-
132
- def clause_boundary?(morpheme)
133
- return false unless morpheme
134
- pos = morpheme.part_of_speech
135
- return true if pos[0] == "助詞" &&
136
- %w[ながら たら ば と のに から ので けれど けど つつ なり や か かどうか とも].include?(morpheme.surface)
137
- return true if pos[0] == "助詞" && pos[1] == "接続助詞" && morpheme.surface == "が"
138
- false
139
- end
140
-
141
- public
142
-
143
95
  # Filter morphemes by POS. Accepts a PosMatcher or an array pattern.
144
96
  # Returns a new MorphemeList with only matching morphemes.
145
97
  #
@@ -1,3 +1,3 @@
1
1
  module Kabosu
2
- VERSION = "0.6.10.1.dev.20260531.8cc8132"
2
+ VERSION = "0.6.10.1.dev.20260531.a468755"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabosu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.10.1.dev.20260531.8cc8132
4
+ version: 0.6.10.1.dev.20260531.a468755
5
5
  platform: ruby
6
6
  authors:
7
7
  - davafons
@@ -110,7 +110,6 @@ files:
110
110
  - ext/kabosu/extconf.rb
111
111
  - ext/kabosu/src/dictionary.rs
112
112
  - ext/kabosu/src/errors.rs
113
- - ext/kabosu/src/grouping.rs
114
113
  - ext/kabosu/src/lib.rs
115
114
  - ext/kabosu/src/morpheme.rs
116
115
  - ext/kabosu/src/nogvl.rs
@@ -1,133 +0,0 @@
1
- use magnus::{Error, RArray, Ruby};
2
- use std::sync::Arc;
3
- use sudachi::dic::dictionary::JapaneseDictionary;
4
-
5
- use crate::morpheme::{rb_morpheme_from_data, MorphemeData};
6
-
7
- /// Group raw `MorphemeData` into jpdb-style chips.
8
- pub(crate) fn group_morphemes_rust(
9
- morphemes: &[MorphemeData],
10
- dict: &Arc<JapaneseDictionary>,
11
- debug: bool,
12
- ) -> Result<RArray, Error> {
13
- let ruby = Ruby::get().unwrap();
14
- let mut groups: Vec<Vec<MorphemeData>> = Vec::with_capacity(morphemes.len());
15
-
16
- for m in morphemes {
17
- if let Some(last_group) = groups.last_mut() {
18
- let head = &last_group[0];
19
- let prev = last_group.last().unwrap();
20
- if is_content_word(head.pos_id, dict) && extends_group(m, prev, dict) {
21
- last_group.push(m.clone());
22
- continue;
23
- }
24
- }
25
- groups.push(vec![m.clone()]);
26
- }
27
-
28
- let result = ruby.ary_new();
29
- for group in groups {
30
- let group_ary = ruby.ary_new();
31
- for data in group {
32
- group_ary.push(rb_morpheme_from_data(data, dict.clone(), debug))?;
33
- }
34
- result.push(group_ary)?;
35
- }
36
- Ok(result)
37
- }
38
-
39
- // POS helpers
40
-
41
- fn is_content_word(pos_id: u16, dict: &JapaneseDictionary) -> bool {
42
- match dict.grammar().pos_components(pos_id).first().map(|s| s.as_str()) {
43
- Some("助詞") | Some("助動詞") | Some("補助記号") | Some("記号") | Some("空白") => false,
44
- _ => true,
45
- }
46
- }
47
-
48
- fn extends_group(
49
- m: &MorphemeData,
50
- prev: &MorphemeData,
51
- dict: &JapaneseDictionary,
52
- ) -> bool {
53
- let comps = dict.grammar().pos_components(m.pos_id);
54
- let pos0 = comps.first().map(|s| s.as_str());
55
- let pos1 = comps.get(1).map(|s| s.as_str());
56
-
57
- match pos0 {
58
- Some("助動詞") => true,
59
- Some("助詞") => {
60
- if is_clause_boundary(m.surface.as_str(), m.pos_id, dict) {
61
- return false;
62
- }
63
- if pos1 == Some("接続助詞") {
64
- return true;
65
- }
66
- // 副助詞 clings to preceding verb/adjective (e.g. たり/だり)
67
- if pos1 == Some("副助詞") && is_verb_adj_adv(prev.pos_id, dict) {
68
- return true;
69
- }
70
- false
71
- }
72
- Some("動詞") => {
73
- if pos1 != Some("非自立可能") {
74
- return false;
75
- }
76
- // te-form auxiliary chain: て/で + いる/ある/くる/etc.
77
- let prev_pos0 = dict.grammar().pos_components(prev.pos_id).first().map(|s| s.as_str());
78
- if prev_pos0 == Some("助詞")
79
- && (prev.surface == "て" || prev.surface == "で")
80
- {
81
- return true;
82
- }
83
- // compound verb (V+V) intentionally skipped — caller handles DB lookup
84
- false
85
- }
86
- _ => false,
87
- }
88
- }
89
-
90
- fn is_clause_boundary(surface: &str, pos_id: u16, dict: &JapaneseDictionary) -> bool {
91
- let comps = dict.grammar().pos_components(pos_id);
92
- let pos0 = comps.first().map(|s| s.as_str());
93
- let pos1 = comps.get(1).map(|s| s.as_str());
94
-
95
- if pos0 == Some("助詞") {
96
- if is_clause_boundary_particle(surface) {
97
- return true;
98
- }
99
- // contrastive が (接続助詞) is a boundary, unlike subject が (格助詞)
100
- if pos1 == Some("接続助詞") && surface == "が" {
101
- return true;
102
- }
103
- }
104
- false
105
- }
106
-
107
- fn is_clause_boundary_particle(surface: &str) -> bool {
108
- matches!(
109
- surface,
110
- "ながら"
111
- | "たら"
112
- | "ば"
113
- | "と"
114
- | "のに"
115
- | "から"
116
- | "ので"
117
- | "けれど"
118
- | "けど"
119
- | "つつ"
120
- | "なり"
121
- | "や"
122
- | "か"
123
- | "かどうか"
124
- | "とも"
125
- )
126
- }
127
-
128
- fn is_verb_adj_adv(pos_id: u16, dict: &JapaneseDictionary) -> bool {
129
- matches!(
130
- dict.grammar().pos_components(pos_id).first().map(|s| s.as_str()),
131
- Some("動詞") | Some("形容詞") | Some("形状詞")
132
- )
133
- }