kabosu 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ require "net/http"
2
+ require "uri"
3
+ require "fileutils"
4
+ require "json"
5
+ require "open3"
6
+
7
module Kabosu
  # Downloads, discovers and removes Sudachi dictionaries kept on local disk.
  #
  # Dictionaries live under +dir+, one directory per release
  # ("sudachi-dictionary-<version>"), each holding "system_<edition>.dic" files.
  class DictManager
    EDITIONS = %w[small core full].freeze
    EDITION_PRIORITY = %w[full core small].freeze
    GITHUB_REPO = "WorksApplications/SudachiDict"
    GITHUB_API = "https://api.github.com"
    VERSION_DIR_PREFIX = "sudachi-dictionary-"

    class DictNotFound < StandardError; end
    class DownloadError < StandardError; end

    # Default storage directory: ~/.kabosu/dict/
    def self.default_dir
      File.join(Dir.home, ".kabosu", "dict")
    end

    # dir:: directory that holds (or will hold) the per-version dictionary folders.
    def initialize(dir: self.class.default_dir)
      @dir = dir
    end

    attr_reader :dir

    # ── Install ──

    # Download and extract a dictionary edition. Returns the path of the .dic file.
    #
    #   manager.install("small")
    #   manager.install("core", version: "20260116")
    #
    # +version+ may carry the release tag's "v" prefix; nil or a blank string
    # means "use the latest release".
    #
    # Raises ArgumentError for an unknown edition and DownloadError when the
    # download or extraction fails.
    def install(edition = "core", version: nil)
      edition = validate_edition(edition)
      version = normalize_version(version) || latest_version

      dest_dir = File.join(@dir, "#{VERSION_DIR_PREFIX}#{version}")
      dic_path = File.join(dest_dir, "system_#{edition}.dic")

      if File.exist?(dic_path)
        $stderr.puts "Already installed: #{dic_path}"
        return dic_path
      end

      url = release_asset_url(version, edition)
      zip_path = File.join(@dir, "#{VERSION_DIR_PREFIX}#{version}-#{edition}.zip")

      FileUtils.mkdir_p(@dir)
      begin
        download(url, zip_path)
        extract(zip_path, @dir)
      ensure
        # Never leave a partial or already-extracted archive behind, even on failure.
        FileUtils.rm_f(zip_path)
      end

      unless File.exist?(dic_path)
        raise DownloadError, "Expected #{dic_path} after extraction, but file not found"
      end

      $stderr.puts "Installed: #{dic_path}"
      dic_path
    end

    # ── Discovery ──

    # List all installed dictionaries.
    # Returns an array of hashes: { version:, edition:, path: }
    # ordered newest version first, then by EDITIONS order within a version.
    def installed
      return [] unless Dir.exist?(@dir)

      # Dir.entries is used instead of Dir.glob so that glob metacharacters in
      # the storage path itself ("[", "{", "*", ...) cannot hide dictionaries.
      version_dirs = Dir.entries(@dir)
                        .select { |name| name.start_with?(VERSION_DIR_PREFIX) }
                        .map { |name| File.join(@dir, name) }
                        .select { |path| File.directory?(path) }

      version_dirs.sort.reverse.flat_map do |version_dir|
        version = File.basename(version_dir)[VERSION_DIR_PREFIX.length..-1]
        EDITIONS
          .map { |edition| { version: version, edition: edition, path: File.join(version_dir, "system_#{edition}.dic") } }
          .select { |entry| File.exist?(entry[:path]) }
      end
    end

    # Find the best available dictionary path.
    # Prefers: latest version, then full > core > small.
    #
    # With +edition+ given, returns the newest installed dictionary of that edition.
    # Raises DictNotFound when nothing suitable is installed.
    def find(edition: nil)
      candidates = installed
      raise DictNotFound, "No dictionaries installed. Run: rake kabosu:install" if candidates.empty?

      if edition
        edition = validate_edition(edition)
        match = candidates.find { |d| d[:edition] == edition }
        raise DictNotFound, "No #{edition} dictionary installed" unless match
        return match[:path]
      end

      # `installed` is sorted newest-first, so the first entry names the latest version.
      newest = candidates.first[:version]
      candidates
        .select { |d| d[:version] == newest }
        .min_by { |d| EDITION_PRIORITY.index(d[:edition]) }[:path]
    end

    # ── Remove ──

    # Remove a specific dictionary edition, or an entire version.
    # With neither filter given, every installed dictionary is removed.
    #
    # +edition+ is compared case-insensitively and may be a Symbol; +version+ may
    # be an Integer or carry a leading "v".
    # Raises DictNotFound when no installed dictionary matches.
    def remove(edition: nil, version: nil)
      wanted_edition = edition && edition.to_s.downcase
      wanted_version = version && version.to_s.sub(/\Av/, "")

      targets = installed
      targets = targets.select { |d| d[:version] == wanted_version } if wanted_version
      targets = targets.select { |d| d[:edition] == wanted_edition } if wanted_edition

      raise DictNotFound, "No matching dictionary found" if targets.empty?

      targets.each do |d|
        FileUtils.rm_f(d[:path])
        $stderr.puts "Removed: #{d[:path]}"

        # Clean up version directories that no longer hold any dictionary
        version_dir = File.dirname(d[:path])
        next unless Dir.exist?(version_dir) && dictionary_files(version_dir).empty?

        FileUtils.rm_rf(version_dir)
        $stderr.puts "Removed empty directory: #{version_dir}"
      end
    end

    # ── Version resolution ──

    # Fetch the latest release tag from GitHub.
    # Raises DownloadError when the API call fails or carries no tag.
    def latest_version
      release = github_json("/repos/#{GITHUB_REPO}/releases/latest")
      tag = release["tag_name"] if release.is_a?(Hash)
      raise DownloadError, "GitHub release response did not include a tag name" unless tag.is_a?(String)

      # Tags are like "v20260116" — strip the "v" prefix
      tag.sub(/\Av/, "")
    end

    # List available versions from GitHub releases.
    # Raises DownloadError when the API call fails.
    def available_versions
      releases = github_json("/repos/#{GITHUB_REPO}/releases")
      raise DownloadError, "Unexpected response while listing GitHub releases" unless releases.is_a?(Array)

      releases.map { |r| r["tag_name"].sub(/\Av/, "") }
    end

    private

    # Normalizes +edition+ to a lowercase String; raises ArgumentError if unknown.
    def validate_edition(edition)
      edition = edition.to_s.downcase
      unless EDITIONS.include?(edition)
        raise ArgumentError, "Unknown edition '#{edition}'. Must be one of: #{EDITIONS.join(", ")}"
      end
      edition
    end

    # Returns +version+ as a String without a leading "v", or nil when blank.
    def normalize_version(version)
      normalized = version.to_s.strip.sub(/\Av/, "")
      normalized.empty? ? nil : normalized
    end

    # Names of the "system_*.dic" files directly inside +version_dir+.
    def dictionary_files(version_dir)
      Dir.entries(version_dir).select { |name| name.start_with?("system_") && name.end_with?(".dic") }
    end

    def release_asset_url(version, edition)
      "https://github.com/#{GITHUB_REPO}/releases/download/v#{version}/sudachi-dictionary-#{version}-#{edition}.zip"
    end

    # Performs a GitHub API GET for +path+ and returns the parsed JSON body.
    # Any non-2xx status or unparsable body is reported as DownloadError.
    def github_json(path)
      response = http_get(URI("#{GITHUB_API}#{path}"), headers: { "Accept" => "application/json" })
      unless response.is_a?(Net::HTTPSuccess)
        raise DownloadError, "GitHub API request failed: #{response.code} #{response.message}"
      end

      JSON.parse(response.body)
    rescue JSON::ParserError => e
      raise DownloadError, "GitHub API returned invalid JSON: #{e.message}"
    end

    # Streams +url+ into the file +dest+, printing progress to stderr.
    def download(url, dest)
      $stderr.puts "Downloading #{url}..."
      uri = resolve_redirects(URI(url))

      Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
        http.request(Net::HTTP::Get.new(uri)) do |response|
          unless response.is_a?(Net::HTTPSuccess)
            raise DownloadError, "Failed to download: #{response.code} #{response.message}"
          end

          total = response["Content-Length"]&.to_i
          written = 0

          File.open(dest, "wb") do |f|
            response.read_body do |chunk|
              f.write(chunk)
              written += chunk.bytesize
              # Percentage can only be shown when the server announced a size.
              next unless total && total > 0

              pct = (written * 100 / total).clamp(0, 100)
              $stderr.print "\r #{megabytes(written)} / #{megabytes(total)} MB (#{pct}%)"
            end
          end

          $stderr.puts "\r #{megabytes(written)} MB downloaded"
        end
      end
    end

    # Byte count expressed in MB, rounded to one decimal place.
    def megabytes(bytes)
      (bytes.to_f / 1024 / 1024).round(1)
    end

    # Follows redirects with HEAD requests and returns the final URI.
    # Raises DownloadError after +limit+ hops.
    def resolve_redirects(uri, limit: 5)
      raise DownloadError, "Too many redirects" if limit == 0

      response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
        http.request(Net::HTTP::Head.new(uri))
      end
      return uri unless response.is_a?(Net::HTTPRedirection)

      resolve_redirects(redirect_target(uri, response), limit: limit - 1)
    end

    # GETs +uri+ following up to +redirect_limit+ redirects; returns the final response.
    def http_get(uri, headers: {}, redirect_limit: 5)
      raise DownloadError, "Too many redirects" if redirect_limit == 0

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")

      request = Net::HTTP::Get.new(uri)
      headers.each { |k, v| request[k] = v }

      response = http.request(request)
      return response unless response.is_a?(Net::HTTPRedirection)

      http_get(redirect_target(uri, response), headers: headers, redirect_limit: redirect_limit - 1)
    end

    # Resolves a redirect's Location header against the URI that produced it,
    # so relative Location values are followed correctly.
    def redirect_target(uri, response)
      location = response["location"]
      if location.nil? || location.empty?
        raise DownloadError, "Redirect from #{uri} did not include a Location header"
      end

      uri + location
    end

    # Unpacks +zip_path+ into +dest_dir+ with the system `unzip` command.
    def extract(zip_path, dest_dir)
      $stderr.puts "Extracting..."
      _stdout, stderr, status = Open3.capture3("unzip", "-o", zip_path, "-d", dest_dir)
      raise DownloadError, "Failed to extract #{zip_path}: #{stderr}" unless status.success?
    rescue Errno::ENOENT
      # Open3 raises ENOENT when the executable itself is missing.
      raise DownloadError, "Failed to extract #{zip_path}: the `unzip` command was not found on PATH"
    end
  end
end
@@ -0,0 +1,97 @@
1
module Kabosu
  # Enumerable wrapper around the morphemes produced by one tokenization run.
  # Optionally carries the +internal_cost+ reported for that run.
  class MorphemeList
    include Enumerable

    attr_accessor :internal_cost

    # morphemes::     the underlying ordered collection of morpheme objects
    # internal_cost:: optional cost value to expose alongside the morphemes
    def initialize(morphemes, internal_cost: nil)
      @internal_cost = internal_cost
      @morphemes = morphemes
    end

    # Yields each morpheme in order (backs every Enumerable method).
    def each(&block)
      @morphemes.each(&block)
    end

    # Positional access, delegated to the wrapped collection.
    def [](index)
      @morphemes[index]
    end

    # First morpheme, or the first +n+ morphemes when +n+ is given.
    def first(n = nil)
      return @morphemes.first unless n

      @morphemes.first(n)
    end

    # Last morpheme, or the last +n+ morphemes when +n+ is given.
    def last(n = nil)
      return @morphemes.last unless n

      @morphemes.last(n)
    end

    # Number of morphemes in the list.
    def size
      @morphemes.size
    end

    # Surface string of every morpheme.
    def surfaces
      map { |morpheme| morpheme.surface }
    end

    # Reading form of every morpheme.
    def readings
      map { |morpheme| morpheme.reading_form }
    end

    # Dictionary form of every morpheme.
    def dictionary_forms
      map { |morpheme| morpheme.dictionary_form }
    end

    # Normalized form of every morpheme.
    def normalized_forms
      map { |morpheme| morpheme.normalized_form }
    end

    # Total cost of every morpheme.
    def total_costs
      map { |morpheme| morpheme.total_cost }
    end

    # Synonym group ids of every morpheme (one entry per morpheme).
    def synonym_group_ids
      map { |morpheme| morpheme.synonym_group_ids }
    end

    # Concatenates all surfaces without a separator, giving back the original text.
    def to_text
      surfaces.join
    end

    # Keep only the morphemes whose POS matches. Accepts a PosMatcher or an
    # array pattern and returns a new MorphemeList.
    #
    #   list.select_pos(Kabosu::PosMatcher.nouns)
    #   list.select_pos(["名詞", "固有名詞"])
    #
    def select_pos(matcher_or_pattern)
      kept = coerce_to_matcher(matcher_or_pattern).filter(@morphemes)
      self.class.new(kept)
    end

    # Drop the morphemes whose POS matches; the complement of select_pos.
    def reject_pos(matcher_or_pattern)
      remaining = coerce_to_matcher(matcher_or_pattern).reject(@morphemes)
      self.class.new(remaining)
    end

    def inspect
      cost_part = @internal_cost ? " cost=#{@internal_cost}" : ""
      "#<Kabosu::MorphemeList (#{size} morphemes)#{cost_part}: #{surfaces.join(" | ")}>"
    end

    private

    # Turns an Array pattern into a PosMatcher; passes a PosMatcher through.
    # Anything else is rejected with ArgumentError.
    def coerce_to_matcher(matcher_or_pattern)
      case matcher_or_pattern
      when PosMatcher then matcher_or_pattern
      when Array then PosMatcher.new(matcher_or_pattern)
      else
        raise ArgumentError, "expected a PosMatcher or an Array pattern, got #{matcher_or_pattern.class}"
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
module Kabosu
  # Immutable predicate over part-of-speech (POS) arrays.
  #
  # A matcher is built either from one or more positional patterns or from a
  # block, and can be combined with other matchers using |, & and -.
  class PosMatcher
    # Build a matcher from POS patterns or a block.
    #
    #   # From a block
    #   PosMatcher.new { |pos| pos[0] == "名詞" }
    #
    #   # Single pattern (array of strings; "*" or nil = match anything)
    #   PosMatcher.new(["名詞", "固有名詞", "*", "*"])
    #
    #   # Multiple patterns (matches if any pattern matches)
    #   PosMatcher.new(["名詞", "固有名詞"], ["動詞", "*", "*", "*"])
    #
    #   # A bare String is shorthand for a one-slot pattern
    #   PosMatcher.new("名詞", "動詞")
    #
    # Patterns are copied, so the caller's arrays are left untouched.
    # Raises ArgumentError when given both patterns and a block, neither, or a
    # pattern that is not a String/Array-like object.
    def initialize(*patterns, &block)
      if block
        raise ArgumentError, "cannot supply both patterns and a block" unless patterns.empty?

        @proc = block
      elsif patterns.empty?
        raise ArgumentError, "must supply at least one pattern or a block"
      else
        @patterns = patterns.map { |pattern| normalize_pattern(pattern) }.freeze
      end

      freeze
    end

    # Returns true if the morpheme (or raw POS array) matches this matcher,
    # false otherwise.
    def match?(morpheme_or_pos)
      pos = extract_pos(morpheme_or_pos)

      if @proc
        # Coerce so a block returning e.g. a match index still yields a boolean.
        @proc.call(pos) ? true : false
      else
        @patterns.any? { |pattern| pattern_match?(pattern, pos) }
      end
    end

    # Return matching morphemes as an Array.
    def filter(morphemes)
      morphemes.select { |m| match?(m) }
    end

    # Return non-matching morphemes as an Array.
    def reject(morphemes)
      morphemes.reject { |m| match?(m) }
    end

    # Union: matches if either matcher matches.
    def |(other)
      PosMatcher.new { |pos| match?(pos) || other.match?(pos) }
    end

    # Intersection: matches if both matchers match.
    def &(other)
      PosMatcher.new { |pos| match?(pos) && other.match?(pos) }
    end

    # Difference: matches self but not other.
    def -(other)
      difference(other)
    end

    def difference(other)
      PosMatcher.new { |pos| match?(pos) && !other.match?(pos) }
    end

    # ── Pre-built matchers ──

    def self.nouns
      @nouns ||= new(["名詞"])
    end

    def self.verbs
      @verbs ||= new(["動詞"])
    end

    def self.adjectives
      @adjectives ||= new(["形容詞"])
    end

    def self.particles
      @particles ||= new(["助詞"])
    end

    def self.auxiliary_verbs
      @auxiliary_verbs ||= new(["助動詞"])
    end

    def self.adverbs
      @adverbs ||= new(["副詞"])
    end

    def self.proper_nouns
      @proper_nouns ||= new(["名詞", "固有名詞"])
    end

    private

    # Returns a frozen private copy of +pattern+; a String becomes a one-slot pattern.
    def normalize_pattern(pattern)
      return [pattern].freeze if pattern.is_a?(String)

      unless pattern.respond_to?(:each_with_index)
        raise ArgumentError, "POS pattern must be a String or an Array of strings, got #{pattern.class}"
      end

      # dup first: freezing the argument itself would freeze the caller's array.
      pattern.dup.freeze
    end

    # Accepts a raw POS Array or anything responding to #part_of_speech.
    def extract_pos(morpheme_or_pos)
      if morpheme_or_pos.is_a?(Array)
        morpheme_or_pos
      elsif morpheme_or_pos.respond_to?(:part_of_speech)
        morpheme_or_pos.part_of_speech
      else
        raise ArgumentError, "expected an Array or an object responding to #part_of_speech"
      end
    end

    # A pattern matches when every slot is a wildcard (nil or "*") or equals
    # the POS element at the same position; extra POS elements are ignored.
    def pattern_match?(pattern, pos)
      pattern.each_with_index.all? do |slot, i|
        slot.nil? || slot == "*" || slot == pos[i]
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
+ require "kabosu/dict_manager"
2
+
3
# Rake tasks for managing Sudachi dictionaries through Kabosu::DictManager.
namespace :kabosu do
  editions = %w[small core full]
  pin_hint = "VERSION=YYYYMMDD to pin a specific release."

  # ── Install ──

  desc "Install the core dictionary (default). #{pin_hint}"
  task :install do
    Kabosu::DictManager.new.install("core", version: ENV["VERSION"])
  end

  # kabosu:install:small / :core / :full
  namespace :install do
    editions.each do |edition|
      desc "Install the #{edition} dictionary. #{pin_hint}"
      task edition.to_sym do
        Kabosu::DictManager.new.install(edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Remove ──

  desc "Remove all installed dictionaries, or a specific one with EDITION=small|core|full and/or VERSION=YYYYMMDD"
  task :remove do
    Kabosu::DictManager.new.remove(edition: ENV["EDITION"], version: ENV["VERSION"])
  end

  # kabosu:remove:small / :core / :full (VERSION narrows the removal)
  namespace :remove do
    editions.each do |edition|
      desc "Remove the #{edition} dictionary."
      task edition.to_sym do
        Kabosu::DictManager.new.remove(edition: edition, version: ENV["VERSION"])
      end
    end
  end

  # ── Info ──

  desc "List installed Sudachi dictionaries"
  task :list do
    manager = Kabosu::DictManager.new
    dicts = manager.installed

    if dicts.empty?
      puts "No dictionaries installed. Run: rake kabosu:install"
      next
    end

    puts "Installed dictionaries (#{manager.dir}):"
    puts
    dicts.each do |entry|
      path = entry[:path]
      size_mb = (File.size(path).to_f / 1024 / 1024).round(1)
      puts " #{entry[:version]} / #{entry[:edition].ljust(5)} (#{size_mb} MB)"
      puts " #{path}"
    end
  end

  desc "Show available dictionary versions from GitHub"
  task :versions do
    versions = Kabosu::DictManager.new.available_versions
    versions.each { |version| puts version }
  end

  desc "Show the path to the best available dictionary. EDITION=small|core|full to be specific."
  task :path do
    begin
      puts Kabosu::DictManager.new.find(edition: ENV["EDITION"])
    rescue Kabosu::DictManager::DictNotFound => e
      # Report a missing dictionary as a plain message plus a failing exit status.
      $stderr.puts e.message
      exit 1
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kabosu
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
data/lib/kabosu.rb ADDED
@@ -0,0 +1,143 @@
1
+ require_relative "kabosu/version"
2
+ require_relative "kabosu/kabosu"
3
+ require_relative "kabosu/dict_manager"
4
+ require_relative "kabosu/pos_matcher"
5
+ require_relative "kabosu/morpheme_list"
6
+
7
module Kabosu
  # ── Dictionary.new: keyword API + auto-discovery ──

  class Dictionary
    class << self
      # Keep the original constructor reachable as ._new(config, dict).
      alias_method :_new, :new

      # Keyword-argument constructor. When +dict+ is omitted, the path comes
      # from Kabosu.dict_path.
      def new(config: nil, dict: nil)
        _new(config, dict || Kabosu.dict_path)
      end
    end
  end

  # ── Tokenizer: wrap output in MorphemeList ──

  class Tokenizer
    alias_method :_tokenize, :tokenize

    # Tokenizes +text+ and wraps the result in a MorphemeList, attaching
    # #internal_cost when this tokenizer exposes it.
    def tokenize(text)
      result = _tokenize(text)
      cost = internal_cost if respond_to?(:internal_cost)
      MorphemeList.new(result, internal_cost: cost)
    end

    # Only wrap sentence tokenization when the underlying class provides it.
    if method_defined?(:tokenize_sentences)
      alias_method :_tokenize_sentences, :tokenize_sentences

      # Returns one MorphemeList per sentence.
      def tokenize_sentences(text)
        _tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
      end
    end
  end

  # ── StatefulTokenizer: wrap output in MorphemeList ──

  class StatefulTokenizer
    alias_method :_tokenize, :tokenize

    # Tokenizes +text+ and wraps the result in a MorphemeList, attaching
    # #internal_cost when this tokenizer exposes it.
    def tokenize(text)
      result = _tokenize(text)
      cost = internal_cost if respond_to?(:internal_cost)
      MorphemeList.new(result, internal_cost: cost)
    end

    # Only wrap sentence tokenization when the underlying class provides it.
    if method_defined?(:tokenize_sentences)
      alias_method :_tokenize_sentences, :tokenize_sentences

      # Returns one MorphemeList per sentence.
      def tokenize_sentences(text)
        _tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
      end
    end
  end

  # ── Dictionary management ──

  # Memoized DictManager using the default storage directory.
  def self.dict_manager
    @dict_manager ||= DictManager.new
  end

  # Installs a dictionary edition through the shared DictManager.
  def self.install_dictionary(edition = "core", version: nil)
    dict_manager.install(edition, version: version)
  end

  # Path of the best installed dictionary (optionally of a given edition).
  def self.dict_path(edition: nil)
    dict_manager.find(edition: edition)
  end

  # All installed dictionaries, as reported by DictManager#installed.
  def self.dictionaries
    dict_manager.installed
  end

  # ── Convenience tokenization ──

  # Tokenize text using the best available dictionary.
  #
  #   Kabosu.tokenize("東京都に住んでいる")
  #   Kabosu.tokenize("東京都に住んでいる", mode: "A")
  #   Kabosu.tokenize("東京都に住んでいる", edition: "small")
  #
  def self.tokenize(text, mode: "C", edition: nil)
    cached_tokenizer(edition: edition, mode: mode).tokenize(text)
  end

  # Tokenize text into sentences. Returns an array of MorphemeList objects,
  # one per sentence.
  #
  #   Kabosu.tokenize_sentences("東京都に住んでいる。大阪も好きだ。")
  #
  def self.tokenize_sentences(text, mode: "C", edition: nil)
    tokenizer = cached_tokenizer(edition: edition, mode: mode)

    # The wrapper alias exists, so tokenize_sentences already returns MorphemeLists.
    return tokenizer.tokenize_sentences(text) if tokenizer.respond_to?(:_tokenize_sentences)

    # Fallback if the alias wasn't set up: wrap the raw per-sentence output here.
    if tokenizer.respond_to?(:tokenize_sentences)
      return tokenizer.tokenize_sentences(text).map { |sentence| MorphemeList.new(sentence) }
    end

    # Final fallback: treat entire text as a single sentence
    [tokenizer.tokenize(text)]
  end

  # Convenience factory for PosMatcher.
  #
  #   nouns = Kabosu.pos_matcher("名詞")
  #   verbs = Kabosu.pos_matcher(["動詞", "*", "*", "*"])
  #   custom = Kabosu.pos_matcher { |pos| pos[0] == "動詞" }
  #
  def self.pos_matcher(*patterns, &block)
    return PosMatcher.new(&block) if block

    # Shorthand: a single string is treated as the first POS element
    single_string = patterns.length == 1 && patterns[0].is_a?(String)
    single_string ? PosMatcher.new([patterns[0]]) : PosMatcher.new(*patterns)
  end

  # Create a StatefulTokenizer for efficient batch processing.
  # Reuses internal buffers across calls.
  #
  #   tok = Kabosu.batch_tokenizer(mode: "A")
  #   texts.each { |t| results << tok.tokenize(t) }
  #
  def self.batch_tokenizer(mode: "C", edition: nil)
    dictionary = Dictionary.new(dict: dict_path(edition: edition))
    dictionary.create_stateful(mode)
  end

  # @api private
  # One tokenizer per (edition, mode) pair, created on first use and reused afterwards.
  def self.cached_tokenizer(edition:, mode:)
    @tokenizers ||= {}
    cache_key = [edition, mode]
    @tokenizers[cache_key] ||= Dictionary.new(dict: dict_path(edition: edition)).create(mode)
  end
  private_class_method :cached_tokenizer
end