kanji-translator 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6d693a0727ff440e9630d4ac66f28ff42da8c707a3d222deedfaa2a394304e55
4
- data.tar.gz: 5c26cc604c58942286badd65577a7b8094f9f304d34ae5e8474489ce18757676
3
+ metadata.gz: 56e5d89e3ee4e402ab0100d1c86b5bd156481671d4dbd6633f7b9577e4155304
4
+ data.tar.gz: bc7eff78a596f65fdc387ac9fa0f3245f4cc72781a785a8f4298fd18b696003a
5
5
  SHA512:
6
- metadata.gz: b3e115cb0d6057dde1642b4fdf8c61acc59b4a8275e471df8a04e4ba01311dfe119677ef4a5fd7d39348687532d5e5d7711c9da2b598529960461eddff3fc140
7
- data.tar.gz: 279143b96cef0e5ecccb6cd8e35b80409e6ca734ba9ec72ef61b0080aea94c76fb4d93f958155e43d4562a1a4143121f0cee4140d6f36b59fb6918f68823ab5e
6
+ metadata.gz: 4a36777ecf342e246a9009c788016d54a74d5a6fa439d6b0ad0e0d3fdd4d1bd86440142c06e67dea29ddcf3be927d08a872db132cca4c0237c698410bcce49d7
7
+ data.tar.gz: 7dba7989316c7c2c29cf5eaf2ed7799f3b92cdb2d133a126a1bc7541d9a157b5003f1ce2f3f1d34e13c1c0eb0279e6786007cde9776daf32ac366a316b76f71b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [1.1.0] - 2025-09-08
4
+
5
+ - Fixed: `to_hira` の結果を必ずひらがなに正規化(返却HTMLにカタカナが混ざる場合の不整合を解消)。
6
+ - Improved: スラッグ生成の分割アルゴリズムを整理し、ASCII連結・空白(全角含む)境界を厳密化。混在テキストでの精度向上。
7
+ - Changed: `to_slug(text, separator: "-", **opts)` に整理(`separator` は直接キーワード、他は `**opts`)。互換性は維持。
8
+ - Refactor: 正規表現・正規化処理の定数化/関数抽出、内部メソッドを `private_class_method` 化。
9
+
3
10
  ## [1.0.0] - 2025-09-08
4
11
 
5
12
  - Breaking: `to_slug` のデフォルト挙動を `segmenter: :tiny` に変更(語境界ごとにハイフン区切り)。
data/README.md CHANGED
@@ -49,20 +49,16 @@ require "kanji/translator/core_ext/string"
49
49
  - ひらがな読みをカタカナに変換して返します。
50
50
  - `Kanji::Translator.to_roma(text, **opts)`
51
51
  - 簡易ヘボン式のローマ字(ASCII、小文字)で返します。拗音/促音(ゃゅょ/っ)に対応。長音記号「ー」は無視します(例: おう→ou)。
52
- - `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true, segmenter: :tiny, **opts)`
52
+ - `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true, **opts)`
53
53
  - `to_roma` の結果をスラッグ化します。
54
54
  - 非英数字を `separator` に置換、連続区切りを圧縮、前後の区切りをトリムします。
55
- - `segmenter: :tiny`(デフォルト): TinySegmenter で分かち、語ごとにハイフン区切り(例: "学校案内" → "gakkou-annai")。
56
- - `segmenter: :space`: 空白でのみ分割(例: "学校 案内" → "gakkou-annai")。
57
- - `segmenter: nil`: 分かちなし(語の自動区切りなし)。
55
+ - 内部で TinySegmenter による分かち書きを行い、語境界ごとにハイフン区切りします(例: "学校案内" → "gakkou-annai")。
58
56
 
59
57
  例(オプション):
60
58
 
61
59
  ```ruby
62
60
  Kanji::Translator.to_hira("漢字", timeout: 3, retries: 1)
63
61
  Kanji::Translator.to_slug("東京タワー 2010") #=> "toukyou-tawa-2010"
64
- Kanji::Translator.to_slug("学校 案内", segmenter: :space) #=> "gakkou-annai"
65
- Kanji::Translator.to_slug("学校案内", segmenter: nil) #=> "gakkouannai"
66
62
  Kanji::Translator.to_slug("Foo Bar", separator: "_") #=> "foo_bar"
67
63
  ```
68
64
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Kanji
4
4
  module Translator
5
- VERSION = "1.0.0"
5
+ VERSION = "1.1.0"
6
6
  end
7
7
  end
@@ -13,12 +13,26 @@ module Kanji
13
13
 
14
14
  USER_AGENT = "kanji-translator/#{VERSION}".freeze
15
15
  HOST = "yomikatawa.com"
16
+ ASCII_RE = /[A-Za-z0-9]/
17
+ SPACE_RE = /[\s\u3000]/ # ASCII whitespace or IDEOGRAPHIC SPACE
18
+ NON_ALNUM_RE = /[^a-z0-9]+/
19
+ JAPANESE_RE = /[一-龯々〆ヵヶぁ-ゖゝゞァ-ヴー]/
20
+ BOUNDARY = :__BOUNDARY__
16
21
 
17
22
  def self.to_hira(text, timeout: 5, retries: 2, backoff: 0.5, user_agent: USER_AGENT)
18
23
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
19
24
 
25
+ # Fast-path for kana inputs: avoid network and normalize locally
26
+ if text.match?(/\A[ぁ-ゖーゝゞ]+\z/)
27
+ return text
28
+ elsif text.match?(/\A[ァ-ヴーヽヾヵヶ]+\z/)
29
+ return katakana_to_hiragana(text)
30
+ end
31
+
20
32
  body = fetch_page(text, timeout: timeout, retries: retries, backoff: backoff, user_agent: user_agent)
21
- parse_hiragana(body)
33
+ hira = parse_hiragana(body)
34
+ # Ensure result is normalized to hiragana only (remote may mix katakana like 固有名詞)
35
+ katakana_to_hiragana(hira)
22
36
  end
23
37
 
24
38
  def self.to_kata(text, **)
@@ -31,35 +45,18 @@ module Kanji
31
45
  hiragana_to_romaji(hira)
32
46
  end
33
47
 
34
- def self.to_slug(text, **opts)
35
- sep = opts.fetch(:separator, "-")
48
+ def self.to_slug(text, separator: "-", **opts)
49
+ sep = separator
36
50
  downcase = opts.fetch(:downcase, true)
37
51
  collapse = opts.fetch(:collapse, true)
38
- segmenter = opts.fetch(:segmenter, :tiny)
39
52
  net_opts = slice_opts(opts, :timeout, :retries, :backoff, :user_agent)
40
53
 
41
- s = case segmenter
42
- when :tiny
43
- tokens = segment_with_tiny(text)
44
- parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
45
- parts.join(sep)
46
- when :space
47
- tokens = segment_with_space(text)
48
- parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
49
- parts.join(sep)
50
- else
51
- roma = to_roma(text, **net_opts)
52
- roma.dup
53
- end
54
+ tokens = segment_with_tiny(text)
55
+ raw_parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
56
+ parts = merge_ascii_parts(raw_parts)
57
+ s = parts.join(sep)
54
58
 
55
- s = s.downcase if downcase
56
- # Replace non-alphanumeric with separator
57
- s = s.gsub(/[^a-z0-9]+/, sep)
58
- # Collapse duplicate separators
59
- s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
60
- # Trim leading/trailing separators
61
- s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
62
- s
59
+ normalize_slug_string(s, sep: sep, downcase: downcase, collapse: collapse)
63
60
  end
64
61
 
65
62
  def self.fetch_page(text, timeout:, retries:, backoff:, user_agent: USER_AGENT)
@@ -124,6 +121,10 @@ module Kanji
124
121
  hira.tr("ぁ-ゔゝゞー", "ァ-ヴヽヾー")
125
122
  end
126
123
 
124
+ def self.katakana_to_hiragana(kata)
125
+ kata.tr("ァ-ヴヽヾヵヶー", "ぁ-ゔゝゞかけー")
126
+ end
127
+
127
128
  DIGRAPHS = {
128
129
  "きゃ" => "kya", "きゅ" => "kyu", "きぇ" => "kye", "きょ" => "kyo",
129
130
  "ぎゃ" => "gya", "ぎゅ" => "gyu", "ぎぇ" => "gye", "ぎょ" => "gyo",
@@ -214,30 +215,81 @@ module Kanji
214
215
 
215
216
  def self.segment_with_tiny(text)
216
217
  require "tiny_segmenter"
217
- TinySegmenter.new.segment(text)
218
+ seg = TinySegmenter.new
219
+ tokens = []
220
+ i = 0
221
+ while i < text.length
222
+ ch = text[i]
223
+ if ch =~ ASCII_RE
224
+ j = i + 1
225
+ j += 1 while j < text.length && text[j] =~ ASCII_RE
226
+ tokens << text[i...j]
227
+ i = j
228
+ elsif ch =~ SPACE_RE
229
+ # treat whitespace (incl. IDEOGRAPHIC SPACE) as a hard boundary
230
+ tokens << BOUNDARY unless tokens.last == BOUNDARY
231
+ i += 1
232
+ else
233
+ # collect contiguous non-ASCII-non-space and segment via TinySegmenter
234
+ j = i + 1
235
+ j += 1 while j < text.length && text[j] !~ /[A-Za-z0-9\s\u3000]/
236
+ chunk = text[i...j]
237
+ tokens.concat(seg.segment(chunk))
238
+ i = j
239
+ end
240
+ end
241
+ tokens
218
242
  rescue LoadError
219
243
  raise Error, "tiny_segmenter gem is not installed. Add `tiny_segmenter` or omit segmenter option."
220
244
  end
221
245
 
222
246
  def self.japanese_token?(tok)
223
247
  # Kanji, Kana, prolonged sound mark, iteration marks, small kana
224
- !!(tok =~ /[一-龯々〆ヵヶぁ-ゖゝゞァ-ヴー]/)
248
+ !!(tok =~ JAPANESE_RE)
225
249
  end
226
250
 
227
- def self.segment_with_space(text)
228
- text.split(/\s+/)
251
+ def self.normalize_slug_part(tok, net_opts)
252
+ if tok == BOUNDARY
253
+ { type: :boundary, text: nil }
254
+ elsif japanese_token?(tok)
255
+ { type: :j, text: to_roma(tok, **net_opts) }
256
+ elsif tok =~ ASCII_RE
257
+ { type: :ascii, text: tok }
258
+ end
229
259
  end
230
260
 
231
- def self.normalize_slug_part(tok, net_opts)
232
- if japanese_token?(tok)
233
- to_roma(tok, **net_opts)
234
- elsif tok =~ /[A-Za-z0-9]+/
235
- tok
261
+ def self.merge_ascii_parts(parts)
262
+ merged = []
263
+ parts.each do |p|
264
+ if p[:type] == :boundary
265
+ merged << p
266
+ elsif !merged.empty? && merged.last[:type] == :ascii && p[:type] == :ascii
267
+ merged.last[:text] << p[:text]
268
+ else
269
+ merged << { type: p[:type], text: p[:text].dup }
270
+ end
236
271
  end
272
+ merged.reject { |p| p[:type] == :boundary }.map { |p| p[:text] }
273
+ end
274
+
275
+ def self.normalize_slug_string(str, sep:, downcase:, collapse:)
276
+ s = str
277
+ s = s.downcase if downcase
278
+ # Replace non-alphanumeric with separator
279
+ s = s.gsub(NON_ALNUM_RE, sep)
280
+ # Collapse duplicate separators
281
+ s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
282
+ # Trim leading/trailing separators
283
+ s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
284
+ s
237
285
  end
238
286
 
239
287
  def self.slice_opts(hash, *keys)
240
288
  hash.slice(*keys)
241
289
  end
290
+
291
+ private_class_method :segment_with_tiny, :japanese_token?, :normalize_slug_part, :merge_ascii_parts,
292
+ :normalize_slug_string, :slice_opts, :backoff_for, :katakana_to_hiragana,
293
+ :hiragana_to_katakana
242
294
  end
243
295
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kanji-translator
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hiromu Kodani