kanji-translator 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4f92f7987bc8ed9600c1b5849f56c0538f6af423afd53249ae70bc83e53a383
4
- data.tar.gz: 226e287702852d0d05563b4a38aba069caef02e52c56c76d4011ef22e928c43c
3
+ metadata.gz: 56e5d89e3ee4e402ab0100d1c86b5bd156481671d4dbd6633f7b9577e4155304
4
+ data.tar.gz: bc7eff78a596f65fdc387ac9fa0f3245f4cc72781a785a8f4298fd18b696003a
5
5
  SHA512:
6
- metadata.gz: 7e09f7183a129996b4fbfeff7e56b86f51e7b6b53ef7db428bb3c45b4e1bea4acb644251e996dbb68fe05c39785b202d76d7d3aca1d50b9c6299ce15ec1c74a7
7
- data.tar.gz: a99f5a503d9b9a1a715d3d9405a86306c110ebc13b0c6bb5c17c494453460f04c05130176a4662138c8d2e24dfa119fd0080f61943c0364e5dd2cab963d24d54
6
+ metadata.gz: 4a36777ecf342e246a9009c788016d54a74d5a6fa439d6b0ad0e0d3fdd4d1bd86440142c06e67dea29ddcf3be927d08a872db132cca4c0237c698410bcce49d7
7
+ data.tar.gz: 7dba7989316c7c2c29cf5eaf2ed7799f3b92cdb2d133a126a1bc7541d9a157b5003f1ce2f3f1d34e13c1c0eb0279e6786007cde9776daf32ac366a316b76f71b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [1.1.0] - 2025-09-08
4
+
5
+ - Fixed: `to_hira` の結果を必ずひらがなに正規化(返却HTMLにカタカナが混ざる場合の不整合を解消)。
6
+ - Improved: スラッグ生成の分割アルゴリズムを整理し、ASCII連結・空白(全角含む)境界を厳密化。混在テキストでの精度向上。
7
+ - Changed: `to_slug(text, separator: "-", **opts)` に整理(`separator` は直接キーワード、他は `**opts`)。互換性は維持。
8
+ - Refactor: 正規表現・正規化処理の定数化/関数抽出、内部メソッドを `private_class_method` 化。
9
+
10
+ ## [1.0.0] - 2025-09-08
11
+
12
+ - Breaking: `to_slug` のデフォルト挙動を `segmenter: :tiny` に変更(語境界ごとにハイフン区切り)。
13
+ - Added: `segmenter: :space` オプションを追加。
14
+ - Added: 依存に `tiny_segmenter (~> 0.0.6)` を追加。
15
+ - Docs/Tests: READMEとRSpecを更新し新仕様を反映。
16
+
3
17
  ## [0.1.0] - 2025-09-08
4
18
 
5
19
  - Initial release
data/README.md CHANGED
@@ -8,13 +8,11 @@
8
8
 
9
9
  Ruby 3.2以上が必要です。
10
10
 
11
- Rubygems公開後:
12
-
13
11
  ```bash
14
12
  bundle add kanji-translator
15
13
  ```
16
14
 
17
- 未公開期間にGitHubから使う場合:
15
+ GitHubから使う場合(任意):
18
16
 
19
17
  ```ruby
20
18
  # Gemfile
@@ -54,6 +52,7 @@ require "kanji/translator/core_ext/string"
54
52
  - `Kanji::Translator.to_slug(text, separator: "-", downcase: true, collapse: true, **opts)`
55
53
  - `to_roma` の結果をスラッグ化します。
56
54
  - 非英数字を `separator` に置換、連続区切りを圧縮、前後の区切りをトリムします。
55
+ - 内部で TinySegmenter による分かち書きを行い、語境界ごとにハイフン区切りします(例: "学校案内" → "gakkou-annai")。
57
56
 
58
57
  例(オプション):
59
58
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Kanji
4
4
  module Translator
5
- VERSION = "0.1.0"
5
+ VERSION = "1.1.0"
6
6
  end
7
7
  end
@@ -13,12 +13,26 @@ module Kanji
13
13
 
14
14
  USER_AGENT = "kanji-translator/#{VERSION}".freeze
15
15
  HOST = "yomikatawa.com"
16
+ ASCII_RE = /[A-Za-z0-9]/
17
+ SPACE_RE = /[\s\u3000]/ # ASCII whitespace or IDEOGRAPHIC SPACE
18
+ NON_ALNUM_RE = /[^a-z0-9]+/
19
+ JAPANESE_RE = /[一-龯々〆ヵヶぁ-ゖゝゞァ-ヴー]/
20
+ BOUNDARY = :__BOUNDARY__
16
21
 
17
22
  def self.to_hira(text, timeout: 5, retries: 2, backoff: 0.5, user_agent: USER_AGENT)
18
23
  raise ArgumentError, "text must be a String" unless text.is_a?(String)
19
24
 
25
+ # Fast-path for kana inputs: avoid network and normalize locally
26
+ if text.match?(/\A[ぁ-ゖーゝゞ]+\z/)
27
+ return text
28
+ elsif text.match?(/\A[ァ-ヴーヽヾヵヶ]+\z/)
29
+ return katakana_to_hiragana(text)
30
+ end
31
+
20
32
  body = fetch_page(text, timeout: timeout, retries: retries, backoff: backoff, user_agent: user_agent)
21
- parse_hiragana(body)
33
+ hira = parse_hiragana(body)
34
+ # Ensure result is normalized to hiragana only (remote may mix katakana like 固有名詞)
35
+ katakana_to_hiragana(hira)
22
36
  end
23
37
 
24
38
  def self.to_kata(text, **)
@@ -31,17 +45,18 @@ module Kanji
31
45
  hiragana_to_romaji(hira)
32
46
  end
33
47
 
34
- def self.to_slug(text, separator: "-", downcase: true, collapse: true, **)
35
- roma = to_roma(text, **)
36
- s = downcase ? roma.downcase : roma.dup
37
- sep = separator
38
- # Replace non-alphanumeric with separator
39
- s = s.gsub(/[^a-z0-9]+/, sep)
40
- # Collapse duplicate separators
41
- s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
42
- # Trim leading/trailing separators
43
- s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
44
- s
48
+ def self.to_slug(text, separator: "-", **opts)
49
+ sep = separator
50
+ downcase = opts.fetch(:downcase, true)
51
+ collapse = opts.fetch(:collapse, true)
52
+ net_opts = slice_opts(opts, :timeout, :retries, :backoff, :user_agent)
53
+
54
+ tokens = segment_with_tiny(text)
55
+ raw_parts = tokens.filter_map { |tok| normalize_slug_part(tok, net_opts) }
56
+ parts = merge_ascii_parts(raw_parts)
57
+ s = parts.join(sep)
58
+
59
+ normalize_slug_string(s, sep: sep, downcase: downcase, collapse: collapse)
45
60
  end
46
61
 
47
62
  def self.fetch_page(text, timeout:, retries:, backoff:, user_agent: USER_AGENT)
@@ -106,6 +121,10 @@ module Kanji
106
121
  hira.tr("ぁ-ゔゝゞー", "ァ-ヴヽヾー")
107
122
  end
108
123
 
124
+ def self.katakana_to_hiragana(kata)
125
+ kata.tr("ァ-ヴヽヾヵヶー", "ぁ-ゔゝゞかけー")
126
+ end
127
+
109
128
  DIGRAPHS = {
110
129
  "きゃ" => "kya", "きゅ" => "kyu", "きぇ" => "kye", "きょ" => "kyo",
111
130
  "ぎゃ" => "gya", "ぎゅ" => "gyu", "ぎぇ" => "gye", "ぎょ" => "gyo",
@@ -193,5 +212,84 @@ module Kanji
193
212
  jitter = rand * 0.05
194
213
  sleep_s + jitter
195
214
  end
215
+
216
+ def self.segment_with_tiny(text)
217
+ require "tiny_segmenter"
218
+ seg = TinySegmenter.new
219
+ tokens = []
220
+ i = 0
221
+ while i < text.length
222
+ ch = text[i]
223
+ if ch =~ ASCII_RE
224
+ j = i + 1
225
+ j += 1 while j < text.length && text[j] =~ ASCII_RE
226
+ tokens << text[i...j]
227
+ i = j
228
+ elsif ch =~ SPACE_RE
229
+ # treat whitespace (incl. IDEOGRAPHIC SPACE) as a hard boundary
230
+ tokens << BOUNDARY unless tokens.last == BOUNDARY
231
+ i += 1
232
+ else
233
+ # collect contiguous non-ASCII-non-space and segment via TinySegmenter
234
+ j = i + 1
235
+ j += 1 while j < text.length && text[j] !~ /[A-Za-z0-9\s\u3000]/
236
+ chunk = text[i...j]
237
+ tokens.concat(seg.segment(chunk))
238
+ i = j
239
+ end
240
+ end
241
+ tokens
242
+ rescue LoadError
243
+ raise Error, "tiny_segmenter gem is not installed. Add `tiny_segmenter` or omit segmenter option."
244
+ end
245
+
246
+ def self.japanese_token?(tok)
247
+ # Kanji, Kana, prolonged sound mark, iteration marks, small kana
248
+ !!(tok =~ JAPANESE_RE)
249
+ end
250
+
251
+ def self.normalize_slug_part(tok, net_opts)
252
+ if tok == BOUNDARY
253
+ { type: :boundary, text: nil }
254
+ elsif japanese_token?(tok)
255
+ { type: :j, text: to_roma(tok, **net_opts) }
256
+ elsif tok =~ ASCII_RE
257
+ { type: :ascii, text: tok }
258
+ end
259
+ end
260
+
261
+ def self.merge_ascii_parts(parts)
262
+ merged = []
263
+ parts.each do |p|
264
+ if p[:type] == :boundary
265
+ merged << p
266
+ elsif !merged.empty? && merged.last[:type] == :ascii && p[:type] == :ascii
267
+ merged.last[:text] << p[:text]
268
+ else
269
+ merged << { type: p[:type], text: p[:text].dup }
270
+ end
271
+ end
272
+ merged.reject { |p| p[:type] == :boundary }.map { |p| p[:text] }
273
+ end
274
+
275
+ def self.normalize_slug_string(str, sep:, downcase:, collapse:)
276
+ s = str
277
+ s = s.downcase if downcase
278
+ # Replace non-alphanumeric with separator
279
+ s = s.gsub(NON_ALNUM_RE, sep)
280
+ # Collapse duplicate separators
281
+ s = s.gsub(/#{Regexp.escape(sep)}{2,}/, sep) if collapse && !sep.empty?
282
+ # Trim leading/trailing separators
283
+ s = s.gsub(/^#{Regexp.escape(sep)}|#{Regexp.escape(sep)}$/, "") unless sep.empty?
284
+ s
285
+ end
286
+
287
+ def self.slice_opts(hash, *keys)
288
+ hash.slice(*keys)
289
+ end
290
+
291
+ private_class_method :segment_with_tiny, :japanese_token?, :normalize_slug_part, :merge_ascii_parts,
292
+ :normalize_slug_string, :slice_opts, :backoff_for, :katakana_to_hiragana,
293
+ :hiragana_to_katakana
196
294
  end
197
295
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kanji-translator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hiromu Kodani
@@ -23,6 +23,20 @@ dependencies:
23
23
  - - "~>"
24
24
  - !ruby/object:Gem::Version
25
25
  version: '1.16'
26
+ - !ruby/object:Gem::Dependency
27
+ name: tiny_segmenter
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: 0.0.6
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.0.6
26
40
  description: |-
27
41
  Fetches readings for Japanese Kanji from yomikatawa.com and converts them
28
42
  to hiragana, katakana, or Hepburn-style romaji. Includes timeout/retry