kanji-translator 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +2 -3
- data/lib/kanji/translator/version.rb +1 -1
- data/lib/kanji/translator.rb +110 -12
- metadata +15 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 56e5d89e3ee4e402ab0100d1c86b5bd156481671d4dbd6633f7b9577e4155304
|
|
4
|
+
data.tar.gz: bc7eff78a596f65fdc387ac9fa0f3245f4cc72781a785a8f4298fd18b696003a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4a36777ecf342e246a9009c788016d54a74d5a6fa439d6b0ad0e0d3fdd4d1bd86440142c06e67dea29ddcf3be927d08a872db132cca4c0237c698410bcce49d7
|
|
7
|
+
data.tar.gz: 7dba7989316c7c2c29cf5eaf2ed7799f3b92cdb2d133a126a1bc7541d9a157b5003f1ce2f3f1d34e13c1c0eb0279e6786007cde9776daf32ac366a316b76f71b
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [1.1.0] - 2025-09-08
|
|
4
|
+
|
|
5
|
+
- Fixed: `to_hira` の結果を必ずひらがなに正規化(返却HTMLにカタカナが混ざる場合の不整合を解消)。
|
|
6
|
+
- Improved: スラッグ生成の分割アルゴリズムを整理し、ASCII連結・空白(全角含む)境界を厳密化。混在テキストでの精度向上。
|
|
7
|
+
- Changed: `to_slug(text, separator: "-", **opts)` に整理(`separator` は直接キーワード、他は `**opts`)。互換性は維持。
|
|
8
|
+
- Refactor: 正規表現・正規化処理の定数化/関数抽出、内部メソッドを `private_class_method` 化。
|
|
9
|
+
|
|
10
|
+
## [1.0.0] - 2025-09-08
|
|
11
|
+
|
|
12
|
+
- Breaking: `to_slug` のデフォルト挙動を `segmenter: :tiny` に変更(語境界ごとにハイフン区切り)。
|
|
13
|
+
- Added: `segmenter: :space` オプションを追加。
|
|
14
|
+
- Added: 依存に `tiny_segmenter (~> 0.0.6)` を追加。
|
|
15
|
+
- Docs/Tests: READMEとRSpecを更新し新仕様を反映。
|
|
16
|
+
|
|
3
17
|
## [0.1.0] - 2025-09-08
|
|
4
18
|
|
|
5
19
|
- Initial release
|
data/README.md
CHANGED
|
@@ -8,13 +8,11 @@
|
|
|
8
8
|
|
|
9
9
|
Ruby 3.2以上が必要です。
|
|
10
10
|
|
|
11
|
-
Rubygems公開後:
|
|
12
|
-
|
|
13
11
|
```bash
|
|
14
12
|
bundle add kanji-translator
|
|
15
13
|
```
|
|
16
14
|
|
|
17
|
-
|
|
15
|
+
GitHubから使う場合(任意):
|
|
18
16
|
|
|
19
17
|
```ruby
|
|
20
18
|
# Gemfile
|
|
@@ -54,6 +52,7 @@ require "kanji/translator/core_ext/string"
|
|
|
54
52
|
- `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true, **opts)`
|
|
55
53
|
- `to_roma` の結果をスラッグ化します。
|
|
56
54
|
- 非英数字を `separator` に置換、連続区切りを圧縮、前後の区切りをトリムします。
|
|
55
|
+
- 内部で TinySegmenter による分かち書きを行い、語境界ごとにハイフン区切りします(例: "学校案内" → "gakkou-annai")。
|
|
57
56
|
|
|
58
57
|
例(オプション):
|
|
59
58
|
|
data/lib/kanji/translator.rb
CHANGED
|
@@ -13,12 +13,26 @@ module Kanji
|
|
|
13
13
|
|
|
14
14
|
USER_AGENT = "kanji-translator/#{VERSION}".freeze
|
|
15
15
|
HOST = "yomikatawa.com"
|
|
16
|
+
ASCII_RE = /[A-Za-z0-9]/
|
|
17
|
+
SPACE_RE = /[\s\u3000]/ # ASCII whitespace or IDEOGRAPHIC SPACE
|
|
18
|
+
NON_ALNUM_RE = /[^a-z0-9]+/
|
|
19
|
+
JAPANESE_RE = /[一-龯々〆ヵヶぁ-ゖゝゞァ-ヴー]/
|
|
20
|
+
BOUNDARY = :__BOUNDARY__
|
|
16
21
|
|
|
17
22
|
def self.to_hira(text, timeout: 5, retries: 2, backoff: 0.5, user_agent: USER_AGENT)
|
|
18
23
|
raise ArgumentError, "text must be a String" unless text.is_a?(String)
|
|
19
24
|
|
|
25
|
+
# Fast-path for kana inputs: avoid network and normalize locally
|
|
26
|
+
if text.match?(/\A[ぁ-ゖーゝゞ]+\z/)
|
|
27
|
+
return text
|
|
28
|
+
elsif text.match?(/\A[ァ-ヴーヽヾヵヶ]+\z/)
|
|
29
|
+
return katakana_to_hiragana(text)
|
|
30
|
+
end
|
|
31
|
+
|
|
20
32
|
body = fetch_page(text, timeout: timeout, retries: retries, backoff: backoff, user_agent: user_agent)
|
|
21
|
-
parse_hiragana(body)
|
|
33
|
+
hira = parse_hiragana(body)
|
|
34
|
+
# Ensure the result is normalized to hiragana only (the remote page may mix in katakana, e.g. for proper nouns)
|
|
35
|
+
katakana_to_hiragana(hira)
|
|
22
36
|
end
|
|
23
37
|
|
|
24
38
|
def self.to_kata(text, **)
|
|
@@ -31,17 +45,18 @@ module Kanji
|
|
|
31
45
|
hiragana_to_romaji(hira)
|
|
32
46
|
end
|
|
33
47
|
|
|
34
|
-
def self.to_slug(text, separator: "-",
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
s =
|
|
44
|
-
|
|
48
|
+
def self.to_slug(text, separator: "-", **opts)
|
|
49
|
+
sep = separator
|
|
50
|
+
downcase = opts.fetch(:downcase, true)
|
|
51
|
+
collapse = opts.fetch(:collapse, true)
|
|
52
|
+
net_opts = slice_opts(opts, :timeout, :retries, :backoff, :user_agent)
|
|
53
|
+
|
|
54
|
+
tokens = segment_with_tiny(text)
|
|
55
|
+
raw_parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
|
|
56
|
+
parts = merge_ascii_parts(raw_parts)
|
|
57
|
+
s = parts.join(sep)
|
|
58
|
+
|
|
59
|
+
normalize_slug_string(s, sep: sep, downcase: downcase, collapse: collapse)
|
|
45
60
|
end
|
|
46
61
|
|
|
47
62
|
def self.fetch_page(text, timeout:, retries:, backoff:, user_agent: USER_AGENT)
|
|
@@ -106,6 +121,10 @@ module Kanji
|
|
|
106
121
|
hira.tr("ぁ-ゔゝゞー", "ァ-ヴヽヾー")
|
|
107
122
|
end
|
|
108
123
|
|
|
124
|
+
def self.katakana_to_hiragana(kata)
|
|
125
|
+
kata.tr("ァ-ヴヽヾヵヶー", "ぁ-ゔゝゞかけー")
|
|
126
|
+
end
|
|
127
|
+
|
|
109
128
|
DIGRAPHS = {
|
|
110
129
|
"きゃ" => "kya", "きゅ" => "kyu", "きぇ" => "kye", "きょ" => "kyo",
|
|
111
130
|
"ぎゃ" => "gya", "ぎゅ" => "gyu", "ぎぇ" => "gye", "ぎょ" => "gyo",
|
|
@@ -193,5 +212,84 @@ module Kanji
|
|
|
193
212
|
jitter = rand * 0.05
|
|
194
213
|
sleep_s + jitter
|
|
195
214
|
end
|
|
215
|
+
|
|
216
|
+
def self.segment_with_tiny(text)
|
|
217
|
+
require "tiny_segmenter"
|
|
218
|
+
seg = TinySegmenter.new
|
|
219
|
+
tokens = []
|
|
220
|
+
i = 0
|
|
221
|
+
while i < text.length
|
|
222
|
+
ch = text[i]
|
|
223
|
+
if ch =~ ASCII_RE
|
|
224
|
+
j = i + 1
|
|
225
|
+
j += 1 while j < text.length && text[j] =~ ASCII_RE
|
|
226
|
+
tokens << text[i...j]
|
|
227
|
+
i = j
|
|
228
|
+
elsif ch =~ SPACE_RE
|
|
229
|
+
# treat whitespace (incl. IDEOGRAPHIC SPACE) as a hard boundary
|
|
230
|
+
tokens << BOUNDARY unless tokens.last == BOUNDARY
|
|
231
|
+
i += 1
|
|
232
|
+
else
|
|
233
|
+
# collect contiguous non-ASCII-non-space and segment via TinySegmenter
|
|
234
|
+
j = i + 1
|
|
235
|
+
j += 1 while j < text.length && text[j] !~ /[A-Za-z0-9\s\u3000]/
|
|
236
|
+
chunk = text[i...j]
|
|
237
|
+
tokens.concat(seg.segment(chunk))
|
|
238
|
+
i = j
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
tokens
|
|
242
|
+
rescue LoadError
|
|
243
|
+
raise Error, "tiny_segmenter gem is not installed. Add `tiny_segmenter` or omit segmenter option."
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def self.japanese_token?(tok)
|
|
247
|
+
# Kanji, Kana, prolonged sound mark, iteration marks, small kana
|
|
248
|
+
!!(tok =~ JAPANESE_RE)
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
def self.normalize_slug_part(tok, net_opts)
|
|
252
|
+
if tok == BOUNDARY
|
|
253
|
+
{ type: :boundary, text: nil }
|
|
254
|
+
elsif japanese_token?(tok)
|
|
255
|
+
{ type: :j, text: to_roma(tok, **net_opts) }
|
|
256
|
+
elsif tok =~ ASCII_RE
|
|
257
|
+
{ type: :ascii, text: tok }
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
def self.merge_ascii_parts(parts)
|
|
262
|
+
merged = []
|
|
263
|
+
parts.each do |p|
|
|
264
|
+
if p[:type] == :boundary
|
|
265
|
+
merged << p
|
|
266
|
+
elsif !merged.empty? && merged.last[:type] == :ascii && p[:type] == :ascii
|
|
267
|
+
merged.last[:text] << p[:text]
|
|
268
|
+
else
|
|
269
|
+
merged << { type: p[:type], text: p[:text].dup }
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
merged.reject { |p| p[:type] == :boundary }.map { |p| p[:text] }
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
def self.normalize_slug_string(str, sep:, downcase:, collapse:)
|
|
276
|
+
s = str
|
|
277
|
+
s = s.downcase if downcase
|
|
278
|
+
# Replace non-alphanumeric with separator
|
|
279
|
+
s = s.gsub(NON_ALNUM_RE, sep)
|
|
280
|
+
# Collapse duplicate separators
|
|
281
|
+
s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
|
|
282
|
+
# Trim leading/trailing separators
|
|
283
|
+
s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
|
|
284
|
+
s
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
def self.slice_opts(hash, *keys)
|
|
288
|
+
hash.slice(*keys)
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
private_class_method :segment_with_tiny, :japanese_token?, :normalize_slug_part, :merge_ascii_parts,
|
|
292
|
+
:normalize_slug_string, :slice_opts, :backoff_for, :katakana_to_hiragana,
|
|
293
|
+
:hiragana_to_katakana
|
|
196
294
|
end
|
|
197
295
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kanji-translator
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 1.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Hiromu Kodani
|
|
@@ -23,6 +23,20 @@ dependencies:
|
|
|
23
23
|
- - "~>"
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '1.16'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: tiny_segmenter
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 0.0.6
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 0.0.6
|
|
26
40
|
description: |-
|
|
27
41
|
Fetches readings for Japanese Kanji from yomikatawa.com and converts them
|
|
28
42
|
to hiragana, katakana, or Hepburn-style romaji. Includes timeout/retry
|