RubyGems - kabosu - Versions diffs - 0.6.10.dev.20260225.c3c6711 → 0.6.10.dev.20260226.98055fb - Mend

kabosu 0.6.10.dev.20260225.c3c6711 → 0.6.10.dev.20260226.98055fb

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3d02b7f222732b36afe4c3da668cc8aa347a00e53d57dc59905dbb40f60cb937
-  data.tar.gz: f105c63fda73d071b4e659891afa868656cb299ee7e04b6c53e0595a34634751
+  metadata.gz: 88012b4a49377c642ff1d694ae3d6da5e6400e2b4bf494c34ee99e1467b1be0c
+  data.tar.gz: 4c7324c24a1880ba62e006d8f5f3cea4610cc9e574122a4ece812b2acc649d68
 SHA512:
-  metadata.gz: e776b3e802b8a6d6a9439367010b81580cb36d59069406d66ee2746ed059b089004e08d62d962b89d787dcfbe30463ef6ac5fba5128f469f944ef6f662e4fe37
-  data.tar.gz: a26907ac57248903d6531d62b35888942dbdd15cdb488a64fa6e782c18ffc7419b0ad52153c0e1656242c7fe9d671d6d80a8ea9fcc9fa8c7059d3ec24ec846ff
+  metadata.gz: 7bb1f40e57f79f5aab99bb3867db3ddaa0e6b0ef34f12b5d479dc961a7a6e5ea7dbbf79db1ac33e95b155843d722e45452716f1cca954f9a42346a9bfbc41b8c
+  data.tar.gz: 7b5de7868de3e7725a3c777a5d3374aa8b452c2611abcb0ffecf22cc74d5a1321055f85db690ca42a78fe0d114530fb354cdec2c1d60e47c00e042caa7cdef8b

data/README.md CHANGED Viewed

@@ -54,6 +54,10 @@ morpheme.begin_c             # => 0                - start character offset
 morpheme.end_c               # => 3                - end character offset
 morpheme.system?             # => true             - from system dictionary?
 morpheme.user?               # => false            - from user dictionary?
+# Split text into natural Japanese sentence boundaries
+Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。")
+# => ["東京都に住んでいる。", "大阪も好きだ。"]
 ```
 ## Installation
@@ -109,7 +113,6 @@ tok_c.tokenize("東京都").surfaces  # => ["東京都"]
 ```
 Modes are symbols only (`:a`, `:b`, `:c` or `Kabosu::MODE_A/B/C`).
-Invalid modes now raise `ArgumentError` (for example, `"A"`).
 ## Advanced Use Cases
@@ -135,12 +138,6 @@ dict.lookup("東京都").surfaces
 m = tokenizer.tokenize("東京都").first
 m.split(mode: :a).surfaces
-# Bulk extractors
-tokenizer.tokenize_surfaces("東京都に住んでいる")
-tokenizer.tokenize_readings("東京都に住んでいる")
-tokenizer.tokenize_dictionary_forms("東京都に住んでいる")
-tokenizer.tokenize_normalized_forms("東京都に住んでいる")
 # Sentence splitting
 Kabosu.split_sentences("東京都に住んでいる。大阪も好きだ。", ranges: true)
 Kabosu.split_sentences("長い文...", limit: 12, with_checker: true)
@@ -203,5 +200,6 @@ bundle exec rake kabosu:install # Install Sudachi dictionary
 bundle exec rake compile        # Build the native extension
 bundle exec rake test           # Run tests
 bench/start                     # Run benchmarks
 ```

data/ext/kabosu/src/dictionary.rs CHANGED Viewed

@@ -64,6 +64,9 @@ impl RbDictionary {
             let data = MorphemeData {
                 surface: surface_slice.to_string(),
+                dictionary_form: info.dictionary_form().to_string(),
+                normalized_form: info.normalized_form().to_string(),
+                reading_form: info.reading_form().to_string(),
                 pos_id: info.pos_id(),
                 word_id_raw: entry.word_id.as_raw(),
                 is_oov: entry.word_id.is_oov(),

data/ext/kabosu/src/lib.rs CHANGED Viewed

@@ -42,16 +42,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
     // Kabosu::Tokenizer
     let tok_class = module.define_class("Tokenizer", ruby.class_object())?;
     tok_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
-    tok_class.define_method("tokenize_surfaces", method!(RbTokenizer::tokenize_surfaces, 1))?;
-    tok_class.define_method("tokenize_readings", method!(RbTokenizer::tokenize_readings, 1))?;
-    tok_class.define_method(
-        "tokenize_dictionary_forms",
-        method!(RbTokenizer::tokenize_dictionary_forms, 1),
-    )?;
-    tok_class.define_method(
-        "tokenize_normalized_forms",
-        method!(RbTokenizer::tokenize_normalized_forms, 1),
-    )?;
     tok_class.define_method("mode", method!(RbTokenizer::mode, 0))?;
     tok_class.define_method("fields", method!(RbTokenizer::fields, 0))?;
     tok_class.define_method("debug?", method!(RbTokenizer::is_debug, 0))?;

data/ext/kabosu/src/morpheme.rs CHANGED Viewed

@@ -13,6 +13,9 @@ use crate::parsing::parse_mode;
 #[derive(Clone)]
 pub(crate) struct MorphemeData {
     pub(crate) surface: String,
+    pub(crate) dictionary_form: String,
+    pub(crate) normalized_form: String,
+    pub(crate) reading_form: String,
     pub(crate) pos_id: u16,
     pub(crate) word_id_raw: u32,
     pub(crate) is_oov: bool,
@@ -37,6 +40,9 @@ where
     MorphemeData {
         surface,
+        dictionary_form: m.dictionary_form().to_string(),
+        normalized_form: m.normalized_form().to_string(),
+        reading_form: m.reading_form().to_string(),
         pos_id: m.part_of_speech_id(),
         word_id_raw: m.word_id().as_raw(),
         is_oov: m.is_oov(),
@@ -84,9 +90,6 @@ pub(crate) struct RbMorpheme {
 }
 struct LazyWordFields {
-    dictionary_form: String,
-    normalized_form: String,
-    reading_form: String,
     synonym_group_ids: Vec<u32>,
     dictionary_form_word_id: i32,
     head_word_length: usize,
@@ -96,14 +99,31 @@ struct LazyWordFields {
 }
 impl RbMorpheme {
+    fn fallback_word_fields(&self) -> LazyWordFields {
+        LazyWordFields {
+            synonym_group_ids: Vec::new(),
+            dictionary_form_word_id: -1,
+            head_word_length: self.data.surface.chars().count(),
+            a_unit_split: Vec::new(),
+            b_unit_split: Vec::new(),
+            word_structure: Vec::new(),
+        }
+    }
     fn resolve_word_fields(&self) -> &LazyWordFields {
         self.word_fields.get_or_init(|| {
+            if self.data.is_oov || self.data.dictionary_id < 0 {
+                return self.fallback_word_fields();
+            }
             let wid = WordId::from_raw(self.data.word_id_raw);
-            match self.dict.lexicon().get_word_info(wid) {
-                Ok(info) => LazyWordFields {
-                    dictionary_form: info.dictionary_form().to_string(),
-                    normalized_form: info.normalized_form().to_string(),
-                    reading_form: info.reading_form().to_string(),
+            let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                self.dict.lexicon().get_word_info(wid)
+            }));
+            match info_result {
+                Ok(Ok(info)) => LazyWordFields {
                     synonym_group_ids: info.synonym_group_ids().to_vec(),
                     dictionary_form_word_id: info.dictionary_form_word_id(),
                     head_word_length: info.head_word_length(),
@@ -111,28 +131,25 @@ impl RbMorpheme {
                     b_unit_split: info.b_unit_split().iter().map(WordId::as_raw).collect(),
                     word_structure: info.word_structure().iter().map(WordId::as_raw).collect(),
                 },
-                Err(_) => LazyWordFields {
-                    dictionary_form: self.data.surface.clone(),
-                    normalized_form: self.data.surface.clone(),
-                    reading_form: self.data.surface.clone(),
-                    synonym_group_ids: Vec::new(),
-                    dictionary_form_word_id: -1,
-                    head_word_length: self.data.surface.chars().count(),
-                    a_unit_split: Vec::new(),
-                    b_unit_split: Vec::new(),
-                    word_structure: Vec::new(),
-                },
+                _ => self.fallback_word_fields(),
             }
         })
     }
     fn split_ids_from_lexicon(&self, mode: Mode) -> Result<Vec<u32>, Error> {
-        let wid = WordId::from_raw(self.data.word_id_raw);
-        if wid.is_oov() {
+        if self.data.is_oov || self.data.dictionary_id < 0 {
             return Ok(Vec::new());
         }
+        let wid = WordId::from_raw(self.data.word_id_raw);
-        let info = self.dict.lexicon().get_word_info(wid).map_err(sudachi_error)?;
+        let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            self.dict.lexicon().get_word_info(wid)
+        }));
+        let info = match info_result {
+            Ok(Ok(info)) => info,
+            Ok(Err(e)) => return Err(sudachi_error(e)),
+            Err(_) => return Err(sudachi_error("panic while reading word info for split")),
+        };
         let ids = match mode {
             Mode::A => info.a_unit_split(),
             Mode::B => info.b_unit_split(),
@@ -174,7 +191,14 @@ impl RbMorpheme {
         for (i, &raw_wid) in split_ids.iter().enumerate() {
             let wid = WordId::from_raw(raw_wid);
-            let info = self.dict.lexicon().get_word_info(wid).map_err(sudachi_error)?;
+            let info_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                self.dict.lexicon().get_word_info(wid)
+            }));
+            let info = match info_result {
+                Ok(Ok(info)) => info,
+                Ok(Err(e)) => return Err(sudachi_error(e)),
+                Err(_) => return Err(sudachi_error("panic while reading child word info for split")),
+            };
             // head_word_length is in codepoints; clamp to remaining characters.
             let mut span_chars = info.head_word_length();
@@ -190,6 +214,9 @@ impl RbMorpheme {
             let child = MorphemeData {
                 surface: surface[start_byte..end_byte].to_string(),
+                dictionary_form: info.dictionary_form().to_string(),
+                normalized_form: info.normalized_form().to_string(),
+                reading_form: info.reading_form().to_string(),
                 pos_id: info.pos_id(),
                 word_id_raw: raw_wid,
                 is_oov: wid.is_oov(),
@@ -230,15 +257,15 @@ impl RbMorpheme {
     }
     pub(crate) fn dictionary_form(&self) -> &str {
-        &self.resolve_word_fields().dictionary_form
+        &self.data.dictionary_form
     }
     pub(crate) fn normalized_form(&self) -> &str {
-        &self.resolve_word_fields().normalized_form
+        &self.data.normalized_form
     }
     pub(crate) fn reading_form(&self) -> &str {
-        &self.resolve_word_fields().reading_form
+        &self.data.reading_form
     }
     pub(crate) fn is_oov(&self) -> bool {

data/ext/kabosu/src/tokenizer.rs CHANGED Viewed

@@ -8,7 +8,6 @@ use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
 use sudachi::analysis::Mode;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::dic::subset::InfoSubset;
-use sudachi::dic::word_id::WordId;
 use sudachi::prelude::MorphemeList;
 use crate::errors::sudachi_error;
@@ -168,20 +167,6 @@ impl RbTokenizer {
         }
     }
-    fn collect_strings<F>(&self, text: String, mut project: F) -> Result<RArray, Error>
-    where
-        F: FnMut(&MorphemeData) -> String,
-    {
-        let ruby = Ruby::get().unwrap();
-        let analyzed = self.analyze(text)?;
-        let ary = ruby.ary_new_capa(analyzed.morphemes.len());
-        for data in analyzed.morphemes {
-            let projected = project(&data);
-            ary.push(ruby.str_new(&projected))?;
-        }
-        Ok(ary)
-    }
     pub(crate) fn tokenize(&self, text: String) -> Result<RbTokenBatch, Error> {
         let analyzed = self.analyze(text)?;
         Ok(RbTokenBatch::new(
@@ -192,40 +177,6 @@ impl RbTokenizer {
         ))
     }
-    pub(crate) fn tokenize_surfaces(&self, text: String) -> Result<RArray, Error> {
-        self.collect_strings(text, |data| data.surface.clone())
-    }
-    pub(crate) fn tokenize_readings(&self, text: String) -> Result<RArray, Error> {
-        let lexicon = self.pool.dict.lexicon();
-        self.collect_strings(text, |data| {
-            lexicon
-                .get_word_info(WordId::from_raw(data.word_id_raw))
-                .map(|info| info.reading_form().to_string())
-                .unwrap_or_else(|_| data.surface.clone())
-        })
-    }
-    pub(crate) fn tokenize_dictionary_forms(&self, text: String) -> Result<RArray, Error> {
-        let lexicon = self.pool.dict.lexicon();
-        self.collect_strings(text, |data| {
-            lexicon
-                .get_word_info(WordId::from_raw(data.word_id_raw))
-                .map(|info| info.dictionary_form().to_string())
-                .unwrap_or_else(|_| data.surface.clone())
-        })
-    }
-    pub(crate) fn tokenize_normalized_forms(&self, text: String) -> Result<RArray, Error> {
-        let lexicon = self.pool.dict.lexicon();
-        self.collect_strings(text, |data| {
-            lexicon
-                .get_word_info(WordId::from_raw(data.word_id_raw))
-                .map(|info| info.normalized_form().to_string())
-                .unwrap_or_else(|_| data.surface.clone())
-        })
-    }
     pub(crate) fn mode(&self) -> String {
         self.mode.to_string()
     }

data/lib/kabosu/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Kabosu
-  VERSION = "0.6.10.dev.20260225.c3c6711"
+  VERSION = "0.6.10.dev.20260226.98055fb"
 end

data/lib/kabosu.rb CHANGED Viewed

@@ -162,47 +162,6 @@ module Kabosu
     rescue RuntimeError => e
       raise TokenizationError.new(e.message), cause: e
     end
-    alias_method :_tokenize_surfaces, :tokenize_surfaces
-    alias_method :_tokenize_readings, :tokenize_readings
-    alias_method :_tokenize_dictionary_forms, :tokenize_dictionary_forms
-    alias_method :_tokenize_normalized_forms, :tokenize_normalized_forms
-    def tokenize_surfaces(text)
-      unless text.is_a?(String)
-        raise ArgumentError, "text must be a String"
-      end
-      _tokenize_surfaces(text)
-    rescue RuntimeError => e
-      raise TokenizationError.new(e.message), cause: e
-    end
-    def tokenize_readings(text)
-      unless text.is_a?(String)
-        raise ArgumentError, "text must be a String"
-      end
-      _tokenize_readings(text)
-    rescue RuntimeError => e
-      raise TokenizationError.new(e.message), cause: e
-    end
-    def tokenize_dictionary_forms(text)
-      unless text.is_a?(String)
-        raise ArgumentError, "text must be a String"
-      end
-      _tokenize_dictionary_forms(text)
-    rescue RuntimeError => e
-      raise TokenizationError.new(e.message), cause: e
-    end
-    def tokenize_normalized_forms(text)
-      unless text.is_a?(String)
-        raise ArgumentError, "text must be a String"
-      end
-      _tokenize_normalized_forms(text)
-    rescue RuntimeError => e
-      raise TokenizationError.new(e.message), cause: e
-    end
   end
   class Morpheme

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kabosu
 version: !ruby/object:Gem::Version
-  version: 0.6.10.dev.20260225.c3c6711
+  version: 0.6.10.dev.20260226.98055fb
 platform: ruby
 authors:
 - davafons
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-02-25 00:00:00.000000000 Z
+date: 2026-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys