kabosu 0.6.10 → 0.6.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7b83adc2b2e4c3a1c7ac4953342e512a48a8d8bba58ec731c5ee86881e9de4d9
4
- data.tar.gz: 5de32bd6f77cfef548e5af147192869937a96d1c9348ada2cb642c4e19529afd
3
+ metadata.gz: 82efeb6b51fcfc69c65004ffbd3bbc63a19c6c78b3a09569cdd8329e72f4766b
4
+ data.tar.gz: b282e3c3ad1b723c7a6aef61a46748c601d148af90639632be00a790567700b9
5
5
  SHA512:
6
- metadata.gz: fc4b8094f85e839ced1141143de1f4ba4e1f5cf3c3d564793add708696c4678b43a49887c1069bca4d59f66622757570ce8751a53410f73f07b2548035e52cb1
7
- data.tar.gz: c55195f1fad167eac7d75d8bda66a31363227f08e48d8f5e10ed0530c8fb5b4bc6816e63370d1eef134ef003c469c3c41e5649f02ab90a55b5357e8a7e09b523
6
+ metadata.gz: 310ce82c04d9e735fd6df5cb70e36ccdc93a54f290daa85d86a296bbe4106cc94d4b60ec76bd0a486769c5d2f2d9c17e4bd0b99b13cba602341a8c43bf5b8f33
7
+ data.tar.gz: 180d32c31053365e2360b2d2b403e1b33fbeb78cbf6bdc1fc3767d4071507f37c33898c52e00d9cf0ef694ce94256fc7df906b026da0c5a0c0feb6f7514d1fe5
data/Cargo.lock CHANGED
@@ -159,7 +159,7 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
159
159
 
160
160
  [[package]]
161
161
  name = "kabosu"
162
- version = "0.6.10"
162
+ version = "0.6.12"
163
163
  dependencies = [
164
164
  "magnus",
165
165
  "sudachi",
data/README.md CHANGED
@@ -85,14 +85,17 @@ Dictionary editions (from smallest to largest): `small`, `core`, `full`. See the
85
85
  Rake tasks for managing Sudachi dictionaries:
86
86
 
87
87
  ```sh
88
- rake kabosu:install[small] # Install a dictionary (VERSION=YYYYMMDD for a specific version)
89
- rake kabosu:list # List installed dictionaries
90
- rake kabosu:versions # Show available versions from GitHub
91
- rake kabosu:path # Show path to best available dictionary
92
- rake kabosu:remove[small] # Remove a dictionary (VERSION=YYYYMMDD for a specific version)
88
+ rake kabosu:install[small] # Install a dictionary (VERSION=YYYYMMDD for a specific version)
89
+ rake kabosu:install_if_missing # Same, but a no-op when a dictionary is already installed
90
+ rake kabosu:list # List installed dictionaries
91
+ rake kabosu:versions # Show available versions from GitHub
92
+ rake kabosu:path # Show path to best available dictionary
93
+ rake kabosu:remove[small] # Remove a dictionary (VERSION=YYYYMMDD for a specific version)
93
94
  ```
94
95
 
95
- Dictionaries are stored in `~/.kabosu/dict/` by default. Set `KABOSU_DICT_DIR` to customize.
96
+ Dictionaries are stored in `~/.kabosu/dict/` by default. Set `KABOSU_DICT_DIR` to customize — useful for pointing at a Docker volume so the dictionary persists across deployments.
97
+
98
+ In a Rails app, the rake tasks are auto-loaded via railtie — no manual `load` needed. For container entrypoints, `rake kabosu:install_if_missing` converges on the desired state without hitting the network on subsequent runs.
96
99
 
97
100
  ## Tokenization modes
98
101
 
@@ -2,7 +2,7 @@ require "net/http"
2
2
  require "uri"
3
3
  require "fileutils"
4
4
  require "json"
5
- require "open3"
5
+ require "zip"
6
6
 
7
7
  module Kabosu
8
8
  class DictManager
@@ -14,9 +14,11 @@ module Kabosu
14
14
  class DictNotFound < StandardError; end
15
15
  class DownloadError < StandardError; end
16
16
 
17
- # Default storage directory: ~/.kabosu/dict/
17
+ # Default storage directory. Honors KABOSU_DICT_DIR so consumers can point
18
+ # the gem at a Docker volume / shared mount without subclassing or threading
19
+ # `dir:` through every call site. Falls back to ~/.kabosu/dict/.
18
20
  def self.default_dir
19
- File.join(Dir.home, ".kabosu", "dict")
21
+ ENV["KABOSU_DICT_DIR"] || File.join(Dir.home, ".kabosu", "dict")
20
22
  end
21
23
 
22
24
  def initialize(dir: self.class.default_dir)
@@ -60,6 +62,24 @@ module Kabosu
60
62
  dic_path
61
63
  end
62
64
 
65
+ # Idempotent install. Returns the existing dictionary path if a matching
66
+ # one is already on disk; otherwise downloads and extracts. Useful for
67
+ # entrypoint scripts and CI hooks that should converge on the desired
68
+ # state without paying the network cost on every run.
69
+ #
70
+ # manager.install_if_missing("core")
71
+ # manager.install_if_missing("core", version: "20260116")
72
+ #
73
+ def install_if_missing(edition = "core", version: nil)
74
+ edition = validate_edition(edition)
75
+ matching = installed.find do |d|
76
+ d[:edition] == edition && (version.nil? || d[:version] == version)
77
+ end
78
+ return matching[:path] if matching
79
+
80
+ install(edition, version: version)
81
+ end
82
+
63
83
  # ── Discovery ──
64
84
 
65
85
  # List all installed dictionaries.
@@ -229,11 +249,19 @@ module Kabosu
229
249
 
230
250
  def extract(zip_path, dest_dir)
231
251
  $stderr.puts "Extracting..."
232
- # Use system unzip — available everywhere, handles large files well
233
- _stdout, stderr, status = Open3.capture3("unzip", "-o", zip_path, "-d", dest_dir)
234
- unless status.success?
235
- raise DownloadError, "Failed to extract #{zip_path}: #{stderr}"
252
+ Zip::File.open(zip_path) do |archive|
253
+ archive.each do |entry|
254
+ target = File.join(dest_dir, entry.name)
255
+ # Guard against zip-slip: refuse entries that escape dest_dir.
256
+ unless File.expand_path(target).start_with?(File.expand_path(dest_dir) + File::SEPARATOR)
257
+ raise DownloadError, "Refusing to extract entry outside dest_dir: #{entry.name}"
258
+ end
259
+ FileUtils.mkdir_p(File.dirname(target))
260
+ entry.extract(target) { true } # overwrite existing
261
+ end
236
262
  end
263
+ rescue Zip::Error => e
264
+ raise DownloadError, "Failed to extract #{zip_path}: #{e.message}"
237
265
  end
238
266
  end
239
267
  end
@@ -0,0 +1,9 @@
1
+ require "rails/railtie"
2
+
3
+ module Kabosu
4
+ class Railtie < Rails::Railtie
5
+ rake_tasks do
6
+ load File.expand_path("tasks.rake", __dir__)
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,179 @@
1
+ #
2
+ # Japanese character category map
3
+ #
4
+ # $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
5
+ #
6
+
7
+ ###################################################################################
8
+ #
9
+ # CHARACTER CATEGORY DEFINITION
10
+ #
11
+ # CATEGORY_NAME INVOKE GROUP LENGTH
12
+ #
13
+ # - CATEGORY_NAME: Name of category. you have to define DEFAULT class.
14
+ # - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon
15
+ # - GROUP: 1/0: make a new word by grouping the same character category
16
+ # - LENGTH: n: 1 to n length new words are added
17
+ #
18
+ DEFAULT 0 1 0 # DEFAULT is a mandatory category!
19
+ SPACE 0 1 0
20
+ KANJI 0 0 2
21
+ SYMBOL 1 1 0
22
+ NUMERIC 1 1 0
23
+ ALPHA 1 1 0
24
+ HIRAGANA 0 1 2
25
+ KATAKANA 1 1 2
26
+ KANJINUMERIC 0 1 0 #change INVOKE 1->0
27
+ GREEK 1 1 0
28
+ CYRILLIC 1 1 0
29
+
30
+ ###################################################################################
31
+ #
32
+ # CODE(UCS2) TO CATEGORY MAPPING
33
+ #
34
+
35
+ # SPACE
36
+ 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
37
+ 0x000D SPACE
38
+ 0x0009 SPACE
39
+ 0x000B SPACE
40
+ 0x000A SPACE
41
+
42
+ # ASCII
43
+ 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./
44
+ 0x0030..0x0039 NUMERIC #0-9
45
+ 0x003A..0x0040 SYMBOL #:;<=>?@
46
+ 0x0041..0x005A ALPHA #A-Z
47
+ 0x005B..0x0060 SYMBOL #[\]^_`
48
+ 0x0061..0x007A ALPHA #a-z
49
+ 0x007B..0x007E SYMBOL #{|}~
50
+
51
+ # Latin
52
+ 0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿
53
+ 0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö
54
+ 0x00D7 SYMBOL # Latin 1 #×
55
+ 0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö
56
+ 0x00F7 SYMBOL # Latin 1 #÷
57
+ 0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ
58
+ 0x0100..0x017F ALPHA # Latin Extended A
59
+ 0x0180..0x0236 ALPHA # Latin Extended B
60
+ 0x1E00..0x1EF9 ALPHA # Latin Extended Additional
61
+
62
+ # CYRILLIC
63
+ 0x0400..0x04F9 CYRILLIC #Ѐ->ӹ
64
+ 0x0500..0x050F CYRILLIC # Cyrillic supplementary
65
+
66
+ # GREEK
67
+ 0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ
68
+
69
+ # HIRAGANA
70
+ 0x3041..0x309F HIRAGANA
71
+
72
+ # KATAKANA
73
+ #0x30A1..0x30FF KATAKANA
74
+ 0x30A1..0x30FA KATAKANA
75
+ 0x30FC..0x30FF KATAKANA
76
+ 0x31F0..0x31FF KATAKANA # Small KU .. Small RO
77
+ # 0x30FC KATAKANA HIRAGANA # ー
78
+ 0x30A1 NOOOVBOW # Small A
79
+ 0x30A3 NOOOVBOW # ...
80
+ 0x30A5 NOOOVBOW
81
+ 0x30A7 NOOOVBOW
82
+ 0x30A9 NOOOVBOW
83
+ 0x30E3 NOOOVBOW
84
+ 0x30E5 NOOOVBOW
85
+ 0x30E7 NOOOVBOW
86
+ 0x30EE NOOOVBOW # Small Wa
87
+ 0x30FC..0x30FE NOOOVBOW # 'ー' 'ヽ' 'ヾ'
88
+
89
+ # Half KATAKANA
90
+ 0xFF66..0xFF9D KATAKANA
91
+ 0xFF9E..0xFF9F KATAKANA
92
+
93
+ # KANJI
94
+ 0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
95
+ 0x2F00..0x2FD5 KANJI
96
+ 0x3005 KANJI NOOOVBOW
97
+ 0x3007 KANJI
98
+ 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension
99
+ #0x4E00..0x9FA5 KANJI
100
+ 0x4E00..0x9FFF KANJI
101
+ 0xF900..0xFA2D KANJI
102
+ 0xFA30..0xFA6A KANJI
103
+
104
+
105
+ # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
106
+ 0x4E00 KANJINUMERIC KANJI
107
+ 0x4E8C KANJINUMERIC KANJI
108
+ 0x4E09 KANJINUMERIC KANJI
109
+ 0x56DB KANJINUMERIC KANJI
110
+ 0x4E94 KANJINUMERIC KANJI
111
+ 0x516D KANJINUMERIC KANJI
112
+ 0x4E03 KANJINUMERIC KANJI
113
+ 0x516B KANJINUMERIC KANJI
114
+ 0x4E5D KANJINUMERIC KANJI
115
+ 0x5341 KANJINUMERIC KANJI
116
+ 0x767E KANJINUMERIC KANJI
117
+ 0x5343 KANJINUMERIC KANJI
118
+ 0x4E07 KANJINUMERIC KANJI
119
+ 0x5104 KANJINUMERIC KANJI
120
+ 0x5146 KANJINUMERIC KANJI
121
+
122
+ # ZENKAKU
123
+ 0xFF10..0xFF19 NUMERIC
124
+ 0xFF21..0xFF3A ALPHA
125
+ 0xFF41..0xFF5A ALPHA
126
+ 0xFF01..0xFF0F SYMBOL #!->/
127
+ 0xFF1A..0xFF20 SYMBOL #:->@
128
+ 0xFF3B..0xFF40 SYMBOL #[->`
129
+ 0xFF5B..0xFF65 SYMBOL #{->・
130
+ 0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form
131
+
132
+ # OTHER SYMBOLS
133
+ 0x2000..0x206F SYMBOL # General Punctuation
134
+ 0x2070..0x209F NUMERIC # Superscripts and Subscripts
135
+ 0x20A0..0x20CF SYMBOL # Currency Symbols
136
+ 0x2100..0x214F SYMBOL # Letterlike Symbols
137
+ 0x2150..0x218F NUMERIC # Number forms
138
+ 0x2100..0x214B SYMBOL # Letterlike Symbols
139
+ 0x2190..0x21FF SYMBOL # Arrow
140
+ 0x2200..0x22FF SYMBOL # Mathematical Operators
141
+ 0x2300..0x23FF SYMBOL # Miscellaneous Technical
142
+ 0x2460..0x24FF SYMBOL # Enclosed NUMERICs
143
+ 0x2501..0x257F SYMBOL # Box Drawing
144
+ 0x2580..0x259F SYMBOL # Block Elements
145
+ 0x25A0..0x25FF SYMBOL # Geometric Shapes
146
+ 0x2600..0x26FE SYMBOL # Miscellaneous Symbols
147
+ 0x2700..0x27BF SYMBOL # Dingbats
148
+ 0x27F0..0x27FF SYMBOL # Supplemental Arrows A
149
+ 0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A
150
+ 0x2800..0x28FF SYMBOL # Braille Patterns
151
+ 0x2900..0x297F SYMBOL # Supplemental Arrows B
152
+ 0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows
153
+ 0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators
154
+ 0x3300..0x33FF SYMBOL
155
+ 0x3200..0x32FE SYMBOL # Enclosed CJK Letters and Months
156
+ 0x3000..0x303F SYMBOL # CJK Symbol and Punctuation
157
+ 0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms
158
+ 0xFE50..0xFE6B SYMBOL # Small Form Variants
159
+
160
+ # added 2006/3/13
161
+ 0x3007 SYMBOL KANJINUMERIC
162
+
163
+ # added 2018/11/30
164
+ 0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks
165
+
166
+ # Unicode combining symbols 2021/12/03
167
+ # from https://en.wikipedia.org/wiki/Combining_character
168
+ 0x0300..0x036F ALL NOOOVBOW # Combining Diacritical Marks
169
+ 0x1AB0..0x1AFF ALL NOOOVBOW # Combining Diacritical Marks Extended
170
+ 0x1DC0..0x1DFF ALL NOOOVBOW # Combining Diacritical Marks Supplement
171
+ 0x20D0..0x20FF ALL NOOOVBOW # Combining Diacritical Marks for Symbols
172
+ 0xFE20..0xFE2F ALL NOOOVBOW # Combining Half Marks
173
+ 0xFE00..0xFE0F ALL NOOOVBOW # https://codepoints.net/variation_selectors
174
+ 0x1F3FB..0x1F3FE ALL NOOOVBOW # emoji skin tone modifiers https://codepoints.net/U+1F3FF
175
+
176
+ # Combination marks
177
+ 0x200C..0x200D ALL NOOOVBOW2 # Zero Width Non-Joiner/Joiner
178
+
179
+ # END OF TABLE