kanji-translator 1.0.0 → 1.1.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +2 -6
- data/lib/kanji/translator/version.rb +1 -1
- data/lib/kanji/translator.rb +86 -34
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 56e5d89e3ee4e402ab0100d1c86b5bd156481671d4dbd6633f7b9577e4155304
|
|
4
|
+
data.tar.gz: bc7eff78a596f65fdc387ac9fa0f3245f4cc72781a785a8f4298fd18b696003a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4a36777ecf342e246a9009c788016d54a74d5a6fa439d6b0ad0e0d3fdd4d1bd86440142c06e67dea29ddcf3be927d08a872db132cca4c0237c698410bcce49d7
|
|
7
|
+
data.tar.gz: 7dba7989316c7c2c29cf5eaf2ed7799f3b92cdb2d133a126a1bc7541d9a157b5003f1ce2f3f1d34e13c1c0eb0279e6786007cde9776daf32ac366a316b76f71b
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [1.1.0] - 2025-09-08
|
|
4
|
+
|
|
5
|
+
- Fixed: `to_hira` の結果を必ずひらがなに正規化(返却HTMLにカタカナが混ざる場合の不整合を解消)。
|
|
6
|
+
- Improved: スラッグ生成の分割アルゴリズムを整理し、ASCII連結・空白(全角含む)境界を厳密化。混在テキストでの精度向上。
|
|
7
|
+
- Changed: `to_slug(text, separator: "-", **opts)` に整理(`separator` は直接キーワード、他は `**opts`)。互換性は維持。
|
|
8
|
+
- Refactor: 正規表現・正規化処理の定数化/関数抽出、内部メソッドを `private_class_method` 化。
|
|
9
|
+
|
|
3
10
|
## [1.0.0] - 2025-09-08
|
|
4
11
|
|
|
5
12
|
- Breaking: `to_slug` のデフォルト挙動を `segmenter: :tiny` に変更(語境界ごとにハイフン区切り)。
|
data/README.md
CHANGED
|
@@ -49,20 +49,16 @@ require "kanji/translator/core_ext/string"
|
|
|
49
49
|
- ひらがな読みをカタカナに変換して返します。
|
|
50
50
|
- `Kanji::Translator.to_roma(text, **opts)`
|
|
51
51
|
- 簡易ヘボン式のローマ字(ASCII、小文字)で返します。拗音/促音(ゃゅょ/っ)に対応。長音記号「ー」は無視します(例: おう→ou)。
|
|
52
|
-
- `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true,
|
|
52
|
+
- `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true, **opts)`
|
|
53
53
|
- `to_roma` の結果をスラッグ化します。
|
|
54
54
|
- 非英数字を `separator` に置換、連続区切りを圧縮、前後の区切りをトリムします。
|
|
55
|
-
-
|
|
56
|
-
- `segmenter: :space`: 空白でのみ分割(例: "学校 案内" → "gakkou-annai")。
|
|
57
|
-
- `segmenter: nil`: 分かちなし(語の自動区切りなし)。
|
|
55
|
+
- 内部で TinySegmenter による分かち書きを行い、語境界ごとにハイフン区切りします(例: "学校案内" → "gakkou-annai")。
|
|
58
56
|
|
|
59
57
|
例(オプション):
|
|
60
58
|
|
|
61
59
|
```ruby
|
|
62
60
|
Kanji::Translator.to_hira("漢字", timeout: 3, retries: 1)
|
|
63
61
|
Kanji::Translator.to_slug("東京タワー 2010") #=> "toukyou-tawa-2010"
|
|
64
|
-
Kanji::Translator.to_slug("学校 案内", segmenter: :space) #=> "gakkou-annai"
|
|
65
|
-
Kanji::Translator.to_slug("学校案内", segmenter: nil) #=> "gakkouannai"
|
|
66
62
|
Kanji::Translator.to_slug("Foo Bar", separator: "_") #=> "foo_bar"
|
|
67
63
|
```
|
|
68
64
|
|
data/lib/kanji/translator.rb
CHANGED
|
@@ -13,12 +13,26 @@ module Kanji
|
|
|
13
13
|
|
|
14
14
|
USER_AGENT = "kanji-translator/#{VERSION}".freeze
|
|
15
15
|
HOST = "yomikatawa.com"
|
|
16
|
+
ASCII_RE = /[A-Za-z0-9]/
|
|
17
|
+
SPACE_RE = /[\s\u3000]/ # ASCII whitespace or IDEOGRAPHIC SPACE
|
|
18
|
+
NON_ALNUM_RE = /[^a-z0-9]+/
|
|
19
|
+
JAPANESE_RE = /[一-龯々〆ヵヶぁ-ゖゝゞァ-ヴー]/
|
|
20
|
+
BOUNDARY = :__BOUNDARY__
|
|
16
21
|
|
|
17
22
|
def self.to_hira(text, timeout: 5, retries: 2, backoff: 0.5, user_agent: USER_AGENT)
|
|
18
23
|
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
19
24
|
|
|
25
|
+
# Fast-path for kana inputs: avoid network and normalize locally
|
|
26
|
+
if text.match?(/\A[ぁ-ゖーゝゞ]+\z/)
|
|
27
|
+
return text
|
|
28
|
+
elsif text.match?(/\A[ァ-ヴーヽヾヵヶ]+\z/)
|
|
29
|
+
return katakana_to_hiragana(text)
|
|
30
|
+
end
|
|
31
|
+
|
|
20
32
|
body = fetch_page(text, timeout: timeout, retries: retries, backoff: backoff, user_agent: user_agent)
|
|
21
|
-
parse_hiragana(body)
|
|
33
|
+
hira = parse_hiragana(body)
|
|
34
|
+
# Ensure result is normalized to hiragana only (remote may mix katakana like 固有名詞)
|
|
35
|
+
katakana_to_hiragana(hira)
|
|
22
36
|
end
|
|
23
37
|
|
|
24
38
|
def self.to_kata(text, **)
|
|
@@ -31,35 +45,18 @@ module Kanji
|
|
|
31
45
|
hiragana_to_romaji(hira)
|
|
32
46
|
end
|
|
33
47
|
|
|
34
|
-
def self.to_slug(text, **opts)
|
|
35
|
-
sep =
|
|
48
|
+
def self.to_slug(text, separator: "-", **opts)
|
|
49
|
+
sep = separator
|
|
36
50
|
downcase = opts.fetch(:downcase, true)
|
|
37
51
|
collapse = opts.fetch(:collapse, true)
|
|
38
|
-
segmenter = opts.fetch(:segmenter, :tiny)
|
|
39
52
|
net_opts = slice_opts(opts, :timeout, :retries, :backoff, :user_agent)
|
|
40
53
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
parts.join(sep)
|
|
46
|
-
when :space
|
|
47
|
-
tokens = segment_with_space(text)
|
|
48
|
-
parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
|
|
49
|
-
parts.join(sep)
|
|
50
|
-
else
|
|
51
|
-
roma = to_roma(text, **net_opts)
|
|
52
|
-
roma.dup
|
|
53
|
-
end
|
|
54
|
+
tokens = segment_with_tiny(text)
|
|
55
|
+
raw_parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
|
|
56
|
+
parts = merge_ascii_parts(raw_parts)
|
|
57
|
+
s = parts.join(sep)
|
|
54
58
|
|
|
55
|
-
s
|
|
56
|
-
# Replace non-alphanumeric with separator
|
|
57
|
-
s = s.gsub(/[^a-z0-9]+/, sep)
|
|
58
|
-
# Collapse duplicate separators
|
|
59
|
-
s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
|
|
60
|
-
# Trim leading/trailing separators
|
|
61
|
-
s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
|
|
62
|
-
s
|
|
59
|
+
normalize_slug_string(s, sep: sep, downcase: downcase, collapse: collapse)
|
|
63
60
|
end
|
|
64
61
|
|
|
65
62
|
def self.fetch_page(text, timeout:, retries:, backoff:, user_agent: USER_AGENT)
|
|
@@ -124,6 +121,10 @@ module Kanji
|
|
|
124
121
|
hira.tr("ぁ-ゔゝゞー", "ァ-ヴヽヾー")
|
|
125
122
|
end
|
|
126
123
|
|
|
124
|
+
def self.katakana_to_hiragana(kata)
|
|
125
|
+
kata.tr("ァ-ヴヽヾヵヶー", "ぁ-ゔゝゞかけー")
|
|
126
|
+
end
|
|
127
|
+
|
|
127
128
|
DIGRAPHS = {
|
|
128
129
|
"きゃ" => "kya", "きゅ" => "kyu", "きぇ" => "kye", "きょ" => "kyo",
|
|
129
130
|
"ぎゃ" => "gya", "ぎゅ" => "gyu", "ぎぇ" => "gye", "ぎょ" => "gyo",
|
|
@@ -214,30 +215,81 @@ module Kanji
|
|
|
214
215
|
|
|
215
216
|
def self.segment_with_tiny(text)
|
|
216
217
|
require "tiny_segmenter"
|
|
217
|
-
TinySegmenter.new
|
|
218
|
+
seg = TinySegmenter.new
|
|
219
|
+
tokens = []
|
|
220
|
+
i = 0
|
|
221
|
+
while i < text.length
|
|
222
|
+
ch = text[i]
|
|
223
|
+
if ch =~ ASCII_RE
|
|
224
|
+
j = i + 1
|
|
225
|
+
j += 1 while j < text.length && text[j] =~ ASCII_RE
|
|
226
|
+
tokens << text[i...j]
|
|
227
|
+
i = j
|
|
228
|
+
elsif ch =~ SPACE_RE
|
|
229
|
+
# treat whitespace (incl. IDEOGRAPHIC SPACE) as a hard boundary
|
|
230
|
+
tokens << BOUNDARY unless tokens.last == BOUNDARY
|
|
231
|
+
i += 1
|
|
232
|
+
else
|
|
233
|
+
# collect contiguous non-ASCII-non-space and segment via TinySegmenter
|
|
234
|
+
j = i + 1
|
|
235
|
+
j += 1 while j < text.length && text[j] !~ /[A-Za-z0-9\s\u3000]/
|
|
236
|
+
chunk = text[i...j]
|
|
237
|
+
tokens.concat(seg.segment(chunk))
|
|
238
|
+
i = j
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
tokens
|
|
218
242
|
rescue LoadError
|
|
219
243
|
raise Error, "tiny_segmenter gem is not installed. Add `tiny_segmenter` or omit segmenter option."
|
|
220
244
|
end
|
|
221
245
|
|
|
222
246
|
def self.japanese_token?(tok)
|
|
223
247
|
# Kanji, Kana, prolonged sound mark, iteration marks, small kana
|
|
224
|
-
!!(tok =~
|
|
248
|
+
!!(tok =~ JAPANESE_RE)
|
|
225
249
|
end
|
|
226
250
|
|
|
227
|
-
def self.
|
|
228
|
-
|
|
251
|
+
def self.normalize_slug_part(tok, net_opts)
|
|
252
|
+
if tok == BOUNDARY
|
|
253
|
+
{ type: :boundary, text: nil }
|
|
254
|
+
elsif japanese_token?(tok)
|
|
255
|
+
{ type: :j, text: to_roma(tok, **net_opts) }
|
|
256
|
+
elsif tok =~ ASCII_RE
|
|
257
|
+
{ type: :ascii, text: tok }
|
|
258
|
+
end
|
|
229
259
|
end
|
|
230
260
|
|
|
231
|
-
def self.
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
261
|
+
def self.merge_ascii_parts(parts)
|
|
262
|
+
merged = []
|
|
263
|
+
parts.each do |p|
|
|
264
|
+
if p[:type] == :boundary
|
|
265
|
+
merged << p
|
|
266
|
+
elsif !merged.empty? && merged.last[:type] == :ascii && p[:type] == :ascii
|
|
267
|
+
merged.last[:text] << p[:text]
|
|
268
|
+
else
|
|
269
|
+
merged << { type: p[:type], text: p[:text].dup }
|
|
270
|
+
end
|
|
236
271
|
end
|
|
272
|
+
merged.reject { |p| p[:type] == :boundary }.map { |p| p[:text] }
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
def self.normalize_slug_string(str, sep:, downcase:, collapse:)
|
|
276
|
+
s = str
|
|
277
|
+
s = s.downcase if downcase
|
|
278
|
+
# Replace non-alphanumeric with separator
|
|
279
|
+
s = s.gsub(NON_ALNUM_RE, sep)
|
|
280
|
+
# Collapse duplicate separators
|
|
281
|
+
s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
|
|
282
|
+
# Trim leading/trailing separators
|
|
283
|
+
s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
|
|
284
|
+
s
|
|
237
285
|
end
|
|
238
286
|
|
|
239
287
|
def self.slice_opts(hash, *keys)
|
|
240
288
|
hash.slice(*keys)
|
|
241
289
|
end
|
|
290
|
+
|
|
291
|
+
private_class_method :segment_with_tiny, :japanese_token?, :normalize_slug_part, :merge_ascii_parts,
|
|
292
|
+
:normalize_slug_string, :slice_opts, :backoff_for, :katakana_to_hiragana,
|
|
293
|
+
:hiragana_to_katakana
|
|
242
294
|
end
|
|
243
295
|
end
|