kabosu 0.6.10 → 0.6.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +1 -1
- data/README.md +9 -6
- data/lib/kabosu/dict_manager.rb +35 -7
- data/lib/kabosu/railtie.rb +9 -0
- data/lib/kabosu/resources/char.def +179 -0
- data/lib/kabosu/resources/rewrite.def +1022 -0
- data/lib/kabosu/resources/sudachi.json +32 -0
- data/lib/kabosu/resources/unk.def +35 -0
- data/lib/kabosu/tasks.rake +6 -0
- data/lib/kabosu/version.rb +1 -1
- data/lib/kabosu.rb +22 -1
- metadata +21 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 82efeb6b51fcfc69c65004ffbd3bbc63a19c6c78b3a09569cdd8329e72f4766b
|
|
4
|
+
data.tar.gz: b282e3c3ad1b723c7a6aef61a46748c601d148af90639632be00a790567700b9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 310ce82c04d9e735fd6df5cb70e36ccdc93a54f290daa85d86a296bbe4106cc94d4b60ec76bd0a486769c5d2f2d9c17e4bd0b99b13cba602341a8c43bf5b8f33
|
|
7
|
+
data.tar.gz: 180d32c31053365e2360b2d2b403e1b33fbeb78cbf6bdc1fc3767d4071507f37c33898c52e00d9cf0ef694ce94256fc7df906b026da0c5a0c0feb6f7514d1fe5
|
data/Cargo.lock
CHANGED
data/README.md
CHANGED
|
@@ -85,14 +85,17 @@ Dictionary editions (from smallest to largest): `small`, `core`, `full`. See the
|
|
|
85
85
|
Rake tasks for managing Sudachi dictionaries:
|
|
86
86
|
|
|
87
87
|
```sh
|
|
88
|
-
rake kabosu:install[small]
|
|
89
|
-
rake kabosu:
|
|
90
|
-
rake kabosu:
|
|
91
|
-
rake kabosu:
|
|
92
|
-
rake kabosu:
|
|
88
|
+
rake kabosu:install[small] # Install a dictionary (VERSION=YYYYMMDD for a specific version)
|
|
89
|
+
rake kabosu:install_if_missing # Same, but a no-op when a dictionary is already installed
|
|
90
|
+
rake kabosu:list # List installed dictionaries
|
|
91
|
+
rake kabosu:versions # Show available versions from GitHub
|
|
92
|
+
rake kabosu:path # Show path to best available dictionary
|
|
93
|
+
rake kabosu:remove[small] # Remove a dictionary (VERSION=YYYYMMDD for a specific version)
|
|
93
94
|
```
|
|
94
95
|
|
|
95
|
-
Dictionaries are stored in `~/.kabosu/dict/` by default. Set `KABOSU_DICT_DIR` to customize.
|
|
96
|
+
Dictionaries are stored in `~/.kabosu/dict/` by default. Set `KABOSU_DICT_DIR` to customize — useful for pointing at a Docker volume so the dictionary persists across deployments.
|
|
97
|
+
|
|
98
|
+
In a Rails app, the rake tasks are auto-loaded via railtie — no manual `load` needed. For container entrypoints, `rake kabosu:install_if_missing` converges on the desired state without hitting the network on subsequent runs.
|
|
96
99
|
|
|
97
100
|
## Tokenization modes
|
|
98
101
|
|
data/lib/kabosu/dict_manager.rb
CHANGED
|
@@ -2,7 +2,7 @@ require "net/http"
|
|
|
2
2
|
require "uri"
|
|
3
3
|
require "fileutils"
|
|
4
4
|
require "json"
|
|
5
|
-
require "
|
|
5
|
+
require "zip"
|
|
6
6
|
|
|
7
7
|
module Kabosu
|
|
8
8
|
class DictManager
|
|
@@ -14,9 +14,11 @@ module Kabosu
|
|
|
14
14
|
class DictNotFound < StandardError; end
|
|
15
15
|
class DownloadError < StandardError; end
|
|
16
16
|
|
|
17
|
-
# Default storage directory
|
|
17
|
+
# Default storage directory. Honors KABOSU_DICT_DIR so consumers can point
|
|
18
|
+
# the gem at a Docker volume / shared mount without subclassing or threading
|
|
19
|
+
# `dir:` through every call site. Falls back to ~/.kabosu/dict/.
|
|
18
20
|
def self.default_dir
|
|
19
|
-
File.join(Dir.home, ".kabosu", "dict")
|
|
21
|
+
ENV["KABOSU_DICT_DIR"] || File.join(Dir.home, ".kabosu", "dict")
|
|
20
22
|
end
|
|
21
23
|
|
|
22
24
|
def initialize(dir: self.class.default_dir)
|
|
@@ -60,6 +62,24 @@ module Kabosu
|
|
|
60
62
|
dic_path
|
|
61
63
|
end
|
|
62
64
|
|
|
65
|
+
# Idempotent install. Returns the existing dictionary path if a matching
|
|
66
|
+
# one is already on disk; otherwise downloads and extracts. Useful for
|
|
67
|
+
# entrypoint scripts and CI hooks that should converge on the desired
|
|
68
|
+
# state without paying the network cost on every run.
|
|
69
|
+
#
|
|
70
|
+
# manager.install_if_missing("core")
|
|
71
|
+
# manager.install_if_missing("core", version: "20260116")
|
|
72
|
+
#
|
|
73
|
+
def install_if_missing(edition = "core", version: nil)
|
|
74
|
+
edition = validate_edition(edition)
|
|
75
|
+
matching = installed.find do |d|
|
|
76
|
+
d[:edition] == edition && (version.nil? || d[:version] == version)
|
|
77
|
+
end
|
|
78
|
+
return matching[:path] if matching
|
|
79
|
+
|
|
80
|
+
install(edition, version: version)
|
|
81
|
+
end
|
|
82
|
+
|
|
63
83
|
# ── Discovery ──
|
|
64
84
|
|
|
65
85
|
# List all installed dictionaries.
|
|
@@ -229,11 +249,19 @@ module Kabosu
|
|
|
229
249
|
|
|
230
250
|
def extract(zip_path, dest_dir)
|
|
231
251
|
$stderr.puts "Extracting..."
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
252
|
+
Zip::File.open(zip_path) do |archive|
|
|
253
|
+
archive.each do |entry|
|
|
254
|
+
target = File.join(dest_dir, entry.name)
|
|
255
|
+
# Guard against zip-slip — refuse entries that escape dest_dir.
|
|
256
|
+
unless File.expand_path(target).start_with?(File.expand_path(dest_dir) + File::SEPARATOR)
|
|
257
|
+
raise DownloadError, "Refusing to extract entry outside dest_dir: #{entry.name}"
|
|
258
|
+
end
|
|
259
|
+
FileUtils.mkdir_p(File.dirname(target))
|
|
260
|
+
entry.extract(target) { true } # overwrite existing
|
|
261
|
+
end
|
|
236
262
|
end
|
|
263
|
+
rescue Zip::Error => e
|
|
264
|
+
raise DownloadError, "Failed to extract #{zip_path}: #{e.message}"
|
|
237
265
|
end
|
|
238
266
|
end
|
|
239
267
|
end
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Japanese character category map
|
|
3
|
+
#
|
|
4
|
+
# $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
###################################################################################
|
|
8
|
+
#
|
|
9
|
+
# CHARACTER CATEGORY DEFINITION
|
|
10
|
+
#
|
|
11
|
+
# CATEGORY_NAME INVOKE GROUP LENGTH
|
|
12
|
+
#
|
|
13
|
+
# - CATEGORY_NAME: Name of category. you have to define DEFAULT class.
|
|
14
|
+
# - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon
|
|
15
|
+
# - GROUP: 1/0: make a new word by grouping the same character category
|
|
16
|
+
# - LENGTH: n: 1 to n length new words are added
|
|
17
|
+
#
|
|
18
|
+
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
|
|
19
|
+
SPACE 0 1 0
|
|
20
|
+
KANJI 0 0 2
|
|
21
|
+
SYMBOL 1 1 0
|
|
22
|
+
NUMERIC 1 1 0
|
|
23
|
+
ALPHA 1 1 0
|
|
24
|
+
HIRAGANA 0 1 2
|
|
25
|
+
KATAKANA 1 1 2
|
|
26
|
+
KANJINUMERIC 0 1 0 #change INVOKE 1->0
|
|
27
|
+
GREEK 1 1 0
|
|
28
|
+
CYRILLIC 1 1 0
|
|
29
|
+
|
|
30
|
+
###################################################################################
|
|
31
|
+
#
|
|
32
|
+
# CODE(UCS2) TO CATEGORY MAPPING
|
|
33
|
+
#
|
|
34
|
+
|
|
35
|
+
# SPACE
|
|
36
|
+
0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
|
|
37
|
+
0x000D SPACE
|
|
38
|
+
0x0009 SPACE
|
|
39
|
+
0x000B SPACE
|
|
40
|
+
0x000A SPACE
|
|
41
|
+
|
|
42
|
+
# ASCII
|
|
43
|
+
0x0021..0x002F SYMBOL #!"#$%&'()*+,-./
|
|
44
|
+
0x0030..0x0039 NUMERIC #0-9
|
|
45
|
+
0x003A..0x0040 SYMBOL #:;<=>?@
|
|
46
|
+
0x0041..0x005A ALPHA #A-Z
|
|
47
|
+
0x005B..0x0060 SYMBOL #[\]^_`
|
|
48
|
+
0x0061..0x007A ALPHA #a-z
|
|
49
|
+
0x007B..0x007E SYMBOL #{|}~
|
|
50
|
+
|
|
51
|
+
# Latin
|
|
52
|
+
0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿
|
|
53
|
+
0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö
|
|
54
|
+
0x00D7 SYMBOL # Latin 1 #×
|
|
55
|
+
0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö
|
|
56
|
+
0x00F7 SYMBOL # Latin 1 #÷
|
|
57
|
+
0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ
|
|
58
|
+
0x0100..0x017F ALPHA # Latin Extended A
|
|
59
|
+
0x0180..0x0236 ALPHA # Latin Extended B
|
|
60
|
+
0x1E00..0x1EF9 ALPHA # Latin Extended Additional
|
|
61
|
+
|
|
62
|
+
# CYRILLIC
|
|
63
|
+
0x0400..0x04F9 CYRILLIC #Ѐ->ӹ
|
|
64
|
+
0x0500..0x050F CYRILLIC # Cyrillic supplementary
|
|
65
|
+
|
|
66
|
+
# GREEK
|
|
67
|
+
0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ
|
|
68
|
+
|
|
69
|
+
# HIRAGANA
|
|
70
|
+
0x3041..0x309F HIRAGANA
|
|
71
|
+
|
|
72
|
+
# KATAKANA
|
|
73
|
+
#0x30A1..0x30FF KATAKANA
|
|
74
|
+
0x30A1..0x30FA KATAKANA
|
|
75
|
+
0x30FC..0x30FF KATAKANA
|
|
76
|
+
0x31F0..0x31FF KATAKANA # Small KU .. Small RO
|
|
77
|
+
# 0x30FC KATAKANA HIRAGANA # ー
|
|
78
|
+
0x30A1 NOOOVBOW # Small A
|
|
79
|
+
0x30A3 NOOOVBOW # ...
|
|
80
|
+
0x30A5 NOOOVBOW
|
|
81
|
+
0x30A7 NOOOVBOW
|
|
82
|
+
0x30A9 NOOOVBOW
|
|
83
|
+
0x30E3 NOOOVBOW
|
|
84
|
+
0x30E5 NOOOVBOW
|
|
85
|
+
0x30E7 NOOOVBOW
|
|
86
|
+
0x30EE NOOOVBOW # Small Wa
|
|
87
|
+
0x30FC..0x30FE NOOOVBOW # 'ー' 'ヽ' 'ヾ'
|
|
88
|
+
|
|
89
|
+
# Half KATAKANA
|
|
90
|
+
0xFF66..0xFF9D KATAKANA
|
|
91
|
+
0xFF9E..0xFF9F KATAKANA
|
|
92
|
+
|
|
93
|
+
# KANJI
|
|
94
|
+
0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
|
|
95
|
+
0x2F00..0x2FD5 KANJI
|
|
96
|
+
0x3005 KANJI NOOOVBOW
|
|
97
|
+
0x3007 KANJI
|
|
98
|
+
0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension
|
|
99
|
+
#0x4E00..0x9FA5 KANJI
|
|
100
|
+
0x4E00..0x9FFF KANJI
|
|
101
|
+
0xF900..0xFA2D KANJI
|
|
102
|
+
0xFA30..0xFA6A KANJI
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
|
|
106
|
+
0x4E00 KANJINUMERIC KANJI
|
|
107
|
+
0x4E8C KANJINUMERIC KANJI
|
|
108
|
+
0x4E09 KANJINUMERIC KANJI
|
|
109
|
+
0x56DB KANJINUMERIC KANJI
|
|
110
|
+
0x4E94 KANJINUMERIC KANJI
|
|
111
|
+
0x516D KANJINUMERIC KANJI
|
|
112
|
+
0x4E03 KANJINUMERIC KANJI
|
|
113
|
+
0x516B KANJINUMERIC KANJI
|
|
114
|
+
0x4E5D KANJINUMERIC KANJI
|
|
115
|
+
0x5341 KANJINUMERIC KANJI
|
|
116
|
+
0x767E KANJINUMERIC KANJI
|
|
117
|
+
0x5343 KANJINUMERIC KANJI
|
|
118
|
+
0x4E07 KANJINUMERIC KANJI
|
|
119
|
+
0x5104 KANJINUMERIC KANJI
|
|
120
|
+
0x5146 KANJINUMERIC KANJI
|
|
121
|
+
|
|
122
|
+
# ZENKAKU
|
|
123
|
+
0xFF10..0xFF19 NUMERIC
|
|
124
|
+
0xFF21..0xFF3A ALPHA
|
|
125
|
+
0xFF41..0xFF5A ALPHA
|
|
126
|
+
0xFF01..0xFF0F SYMBOL #!->/
|
|
127
|
+
0xFF1A..0xFF20 SYMBOL #:->@
|
|
128
|
+
0xFF3B..0xFF40 SYMBOL #[->`
|
|
129
|
+
0xFF5B..0xFF65 SYMBOL #{->・
|
|
130
|
+
0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form
|
|
131
|
+
|
|
132
|
+
# OTHER SYMBOLS
|
|
133
|
+
0x2000..0x206F SYMBOL # General Punctuation
|
|
134
|
+
0x2070..0x209F NUMERIC # Superscripts and Subscripts
|
|
135
|
+
0x20A0..0x20CF SYMBOL # Currency Symbols
|
|
136
|
+
0x2100..0x214F SYMBOL # Letterlike Symbols
|
|
137
|
+
0x2150..0x218F NUMERIC # Number forms
|
|
138
|
+
0x2100..0x214B SYMBOL # Letterlike Symbols
|
|
139
|
+
0x2190..0x21FF SYMBOL # Arrow
|
|
140
|
+
0x2200..0x22FF SYMBOL # Mathematical Operators
|
|
141
|
+
0x2300..0x23FF SYMBOL # Miscellaneous Technical
|
|
142
|
+
0x2460..0x24FF SYMBOL # Enclosed NUMERICs
|
|
143
|
+
0x2501..0x257F SYMBOL # Box Drawing
|
|
144
|
+
0x2580..0x259F SYMBOL # Block Elements
|
|
145
|
+
0x25A0..0x25FF SYMBOL # Geometric Shapes
|
|
146
|
+
0x2600..0x26FE SYMBOL # Miscellaneous Symbols
|
|
147
|
+
0x2700..0x27BF SYMBOL # Dingbats
|
|
148
|
+
0x27F0..0x27FF SYMBOL # Supplemental Arrows A
|
|
149
|
+
0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A
|
|
150
|
+
0x2800..0x28FF SYMBOL # Braille Patterns
|
|
151
|
+
0x2900..0x297F SYMBOL # Supplemental Arrows B
|
|
152
|
+
0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows
|
|
153
|
+
0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators
|
|
154
|
+
0x3300..0x33FF SYMBOL
|
|
155
|
+
0x3200..0x32FE SYMBOL # Enclosed CJK Letters and Months
|
|
156
|
+
0x3000..0x303F SYMBOL # CJK Symbol and Punctuation
|
|
157
|
+
0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms
|
|
158
|
+
0xFE50..0xFE6B SYMBOL # Small Form Variants
|
|
159
|
+
|
|
160
|
+
# added 2006/3/13
|
|
161
|
+
0x3007 SYMBOL KANJINUMERIC
|
|
162
|
+
|
|
163
|
+
# added 2018/11/30
|
|
164
|
+
0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks
|
|
165
|
+
|
|
166
|
+
# Unicode combining symbols 2021/12/03
|
|
167
|
+
# from https://en.wikipedia.org/wiki/Combining_character
|
|
168
|
+
0x0300..0x036F ALL NOOOVBOW # Combining Diacritical Marks
|
|
169
|
+
0x1AB0..0x1AFF ALL NOOOVBOW # Combining Diacritical Marks Extended
|
|
170
|
+
0x1DC0..0x1DFF ALL NOOOVBOW # Combining Diacritical Marks Supplement
|
|
171
|
+
0x20D0..0x20FF ALL NOOOVBOW # Combining Diacritical Marks for Symbols
|
|
172
|
+
0xFE20..0xFE2F ALL NOOOVBOW # Combining Half Marks
|
|
173
|
+
0xFE00..0xFE0F ALL NOOOVBOW # https://codepoints.net/variation_selectors
|
|
174
|
+
0x1F3FB..0x1F3FE ALL NOOOVBOW # emoji skin tone modifiers https://codepoints.net/U+1F3FF
|
|
175
|
+
|
|
176
|
+
# Combination marks
|
|
177
|
+
0x200C..0x200D ALL NOOOVBOW2 # Zero Width Non-Joiner/Joiner
|
|
178
|
+
|
|
179
|
+
# END OF TABLE
|