neologdish-normalizer 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2aa7f75228455a5870d56e6f8b74dd74dcb9f8c0619bb86eda89e02518478afd
4
- data.tar.gz: b92a3042b6f7c3ad7b2653bb7f04643e2769733104ee78e1c689170776729b53
3
+ metadata.gz: 57bda9b5b4b5de8dc0b6582e3a5eb5d2103d3207a405bb7b977b52e0eff02322
4
+ data.tar.gz: a35a668f517d31229e370dd4839c911f2e1b16f6c868e8537d9e61f2e44c7eee
5
5
  SHA512:
6
- metadata.gz: 4479aa95d7ba3d2aca68feecf5a4674dc5af9f71c795ac748bcb6e38558b7d9e73c4d068b071441c020b96a4765b58224ad0328fe4856aafcf934bf6eb4d6e99
7
- data.tar.gz: fb794e40955c4460bf592fac2f9baf415bb9c486ff07a288fc9b3a083bb8256a98d949291b497fd36b2e846ea083dee3925afd8001ae6a7ce90e4536badf6836
6
+ metadata.gz: bf2788e135ccecdfdea6a4be0e52e4e1b151e4fc6c8847844299d2bbc1578c23553f92d6d47bea0e955dff2ddb941f9944e94cbfe420bbe51b733c2094497f6e
7
+ data.tar.gz: 8da7c4aa8848bef56fd5d612f22e08e0cf33cde2ccc0af829d2394da3f8bd9cb5c2ae38223adcd7d3b871df6b3909f1c550a36beb86f0bdcf6a9ab14c0d65345
data/.rubocop.yml CHANGED
@@ -1,10 +1,9 @@
1
1
  require:
2
- - rubocop-rake
3
2
  - rubocop-minitest
4
-
3
+ plugins:
4
+ - rubocop-rake
5
5
  AllCops:
6
6
  NewCops: enable
7
-
8
7
  Metrics:
9
8
  Enabled: false
10
9
  Layout/LineLength:
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.3.5
1
+ 3.4.4
data/README.md CHANGED
@@ -47,5 +47,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
47
47
 
48
48
  ## Contributing
49
49
 
50
- Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer.
50
+ Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer-ruby.
51
51
 
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Neologdish
4
4
  module Normalizer
5
- VERSION = '0.1.0' #: String
5
+ VERSION = '0.2.0'
6
6
  end
7
7
  end
@@ -5,6 +5,9 @@ require_relative 'normalizer/version'
5
5
  module Neologdish
6
6
  # A Japanese text normalizer module according to the neologd convention.
7
7
  module Normalizer
8
+ NORMALIZED_HYPHEN = "\u002d" # -
9
+ NORMALIZED_VOWEL = "\u30fc" # ー
10
+
8
11
  CONVERSION_MAP = {
9
12
  # Normalize [0-9a-zA-Z] to half-width
10
13
  '０' => '0', '１' => '1', '２' => '2', '３' => '3', '４' => '4', '５' => '5', '６' => '6', '７' => '7', '８' => '8', '９' => '9',
@@ -15,11 +18,31 @@ module Neologdish
15
18
  'ｋ' => 'k', 'ｌ' => 'l', 'ｍ' => 'm', 'ｎ' => 'n', 'ｏ' => 'o', 'ｐ' => 'p', 'ｑ' => 'q', 'ｒ' => 'r', 'ｓ' => 's', 'ｔ' => 't',
16
19
  'ｕ' => 'u', 'ｖ' => 'v', 'ｗ' => 'w', 'ｘ' => 'x', 'ｙ' => 'y', 'ｚ' => 'z',
17
20
  # normalize the hyphen/minus-ish characters to '-'
18
- '˗' => '-', '֊' => '-', '‐' => '-', '‑' => '-', '‒' => '-', '–' => '-', '⁃' => '-', '⁻' => '-', '₋' => '-', '−' => '-',
21
+ "\u02d7" => NORMALIZED_HYPHEN, # ˗
22
+ "\u058a" => NORMALIZED_HYPHEN, # ֊
23
+ "\u2010" => NORMALIZED_HYPHEN, # ‐
24
+ "\u2011" => NORMALIZED_HYPHEN, # ‑
25
+ "\u2012" => NORMALIZED_HYPHEN, # ‒
26
+ "\u2013" => NORMALIZED_HYPHEN, # –
27
+ "\u2043" => NORMALIZED_HYPHEN, # ⁃
28
+ "\u207b" => NORMALIZED_HYPHEN, # ⁻
29
+ "\u208b" => NORMALIZED_HYPHEN, # ₋
30
+ "\u2212" => NORMALIZED_HYPHEN, # −
19
31
  # normalize the long-vowel mark-ish characters to 'ー'
20
- '﹣' => 'ー', '－' => 'ー', 'ｰ' => 'ー', '—' => 'ー', '―' => 'ー', '─' => 'ー', '━' => 'ー',
32
+ "\u2014" => NORMALIZED_VOWEL, # —
33
+ "\u2015" => NORMALIZED_VOWEL, # ―
34
+ "\u2500" => NORMALIZED_VOWEL, # ─
35
+ "\u2501" => NORMALIZED_VOWEL, # ━
36
+ "\ufe63" => NORMALIZED_VOWEL, # ﹣
37
+ "\uff0d" => NORMALIZED_VOWEL, # －
38
+ "\uff70" => NORMALIZED_VOWEL, # ｰ
21
39
  # remove the tilde-ish characters
22
- '~' => '', '∼' => '', '∾' => '', '〜' => '', '〰' => '', '～' => '',
40
+ "\u007e" => '', # ~
41
+ "\u223c" => '', # ∼
42
+ "\u223e" => '', # ∾
43
+ "\u301c" => '', # 〜
44
+ "\u3030" => '', # 〰
45
+ "\uff5e" => '', # ～
23
46
  # normalize the full-width special symbol characters (/!”#$%&’()*+,−./:;<>?@[¥]^_`{|}) and space characters to half-width
24
47
  '　' => ' ', '！' => '!', '”' => '"', '＃' => '#', '＄' => '$', '％' => '%', '＆' => '&', '’' => "'", '（' => '(', '）' => ')',
25
48
  '＊' => '*', '＋' => '+', '，' => ',', '．' => '.', '／' => '/', '：' => ':', '；' => ';', '＜' => '<', '＞' => '>', '？' => '?',
@@ -53,18 +76,39 @@ module Neologdish
53
76
  'ッ' => 'ッ', 'ャ' => 'ヤ', 'ュ' => 'ユ', 'ョ' => 'ヨ'
54
77
  }.freeze #: Hash[String, String]
55
78
 
79
+ DAKUON_HANDAKUON_POSSIBLES = {
80
+ 'ウ' => true,
81
+ 'カ' => true, 'キ' => true, 'ク' => true, 'ケ' => true, 'コ' => true,
82
+ 'サ' => true, 'シ' => true, 'ス' => true, 'セ' => true, 'ソ' => true,
83
+ 'タ' => true, 'チ' => true, 'ツ' => true, 'テ' => true, 'ト' => true,
84
+ 'ハ' => true, 'ヒ' => true, 'フ' => true, 'ヘ' => true, 'ホ' => true,
85
+ 'う' => true,
86
+ 'か' => true, 'き' => true, 'く' => true, 'け' => true, 'こ' => true,
87
+ 'さ' => true, 'し' => true, 'す' => true, 'せ' => true, 'そ' => true,
88
+ 'た' => true, 'ち' => true, 'つ' => true, 'て' => true, 'と' => true,
89
+ 'は' => true, 'ひ' => true, 'ふ' => true, 'へ' => true, 'ほ' => true
90
+ }.freeze #: Hash[String, bool]
91
+
56
92
  DAKUON_KANA_MAP = {
93
+ 'ウ' => 'ヴ',
57
94
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
58
95
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
59
96
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
60
- 'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
97
+ 'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
98
+ 'う' => 'ゔ',
99
+ 'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
100
+ 'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
101
+ 'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
102
+ 'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
61
103
  }.freeze #: Hash[String, String]
62
104
 
63
105
  HANDAKUON_KANA_MAP = {
64
- 'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ'
106
+ 'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ',
107
+ 'は' => 'ぱ', 'ひ' => 'ぴ', 'ふ' => 'ぷ', 'へ' => 'ぺ', 'ほ' => 'ぽ'
65
108
  }.freeze #: Hash[String, String]
66
109
 
67
- private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP
110
+ private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP, :DAKUON_HANDAKUON_POSSIBLES,
111
+ :NORMALIZED_HYPHEN, :NORMALIZED_VOWEL
68
112
 
69
113
  # Normalize the given text.
70
114
  #
@@ -77,23 +121,29 @@ module Neologdish
77
121
  squeezee = ''
78
122
  prev_latin = false
79
123
  whitespace_encountered = false
80
- encountered_half_width_kana = nil
124
+ dakuon_handakuon_possible = nil
81
125
  normalized = str.chars.map do |c|
82
126
  prefix = ''
83
127
  c = conversion_map[c] || c
84
128
 
85
129
  # normalize the Half-width kana to full-width
86
- if encountered_half_width_kana
87
- if (c == '゙' && (k = DAKUON_KANA_MAP[encountered_half_width_kana])) ||
88
- (c == '゚' && (k = HANDAKUON_KANA_MAP[encountered_half_width_kana]))
130
+ if dakuon_handakuon_possible
131
+ if (["\u309b", "\u3099", "\uff9e"].include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
132
+ (["\u309c", "\u309a", "\uff9f"].include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
89
133
  c = ''
90
134
  prefix = k
91
135
  else
92
- prefix = encountered_half_width_kana
136
+ prefix = dakuon_handakuon_possible
93
137
  end
94
138
  end
95
139
 
96
140
  if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
141
+ c = encountered_half_width_kana
142
+ end
143
+
144
+ dakuon_handakuon_possible = nil
145
+ if DAKUON_HANDAKUON_POSSIBLES[c]
146
+ dakuon_handakuon_possible = c
97
147
  c = ''
98
148
  end
99
149
 
@@ -120,7 +170,7 @@ module Neologdish
120
170
  prev_latin = is_latin
121
171
 
122
172
  prefix + c
123
- end.join + (encountered_half_width_kana || '')
173
+ end.join + (dakuon_handakuon_possible || '')
124
174
 
125
175
  normalized.strip
126
176
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Neologdish
4
4
  module Normalizer
5
- VERSION: String
5
+ VERSION: ::String
6
6
  end
7
7
  end
@@ -3,12 +3,18 @@
3
3
  module Neologdish
4
4
  # A Japanese text normalizer module according to the neologd convention.
5
5
  module Normalizer
6
+ NORMALIZED_HYPHEN: ::String
7
+
8
+ NORMALIZED_VOWEL: ::String
9
+
6
10
  CONVERSION_MAP: Hash[String, String]
7
11
 
8
12
  LATIN_MAP: Hash[String, bool]
9
13
 
10
14
  HALF_WIDTH_KANA_MAP: Hash[String, String]
11
15
 
16
+ DAKUON_HANDAKUON_POSSIBLES: Hash[String, bool]
17
+
12
18
  DAKUON_KANA_MAP: Hash[String, String]
13
19
 
14
20
  HANDAKUON_KANA_MAP: Hash[String, String]
@@ -18,6 +24,6 @@ module Neologdish
18
24
  # @rbs str: String
19
25
  # @rbs override_conversion_map: Hash[String, String]
20
26
  # @rbs return: String
21
- def normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
27
+ def self?.normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
22
28
  end
23
29
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neologdish-normalizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - moznion
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-10-29 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies: []
13
12
  description: A Japanese text normalization library follows the conventions of neologd
14
13
  with some performance optimizations. It is designed to preprocess Japanese text
@@ -29,14 +28,13 @@ files:
29
28
  - renovate.json
30
29
  - sig/generated/neologdish/normalizer.rbs
31
30
  - sig/generated/neologdish/normalizer/version.rbs
32
- homepage: https://github.com/moznion/neologdish-normalizer
31
+ homepage: https://github.com/moznion/neologdish-normalizer-ruby
33
32
  licenses: []
34
33
  metadata:
35
- homepage_uri: https://github.com/moznion/neologdish-normalizer
36
- source_code_uri: https://github.com/moznion/neologdish-normalizer
37
- changelog_uri: https://github.com/moznion/neologdish-normalizer/releases
34
+ homepage_uri: https://github.com/moznion/neologdish-normalizer-ruby
35
+ source_code_uri: https://github.com/moznion/neologdish-normalizer-ruby
36
+ changelog_uri: https://github.com/moznion/neologdish-normalizer-ruby/releases
38
37
  rubygems_mfa_required: 'true'
39
- post_install_message:
40
38
  rdoc_options: []
41
39
  require_paths:
42
40
  - lib
@@ -51,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
51
49
  - !ruby/object:Gem::Version
52
50
  version: '0'
53
51
  requirements: []
54
- rubygems_version: 3.5.16
55
- signing_key:
52
+ rubygems_version: 3.6.7
56
53
  specification_version: 4
57
54
  summary: A Japanese text normalization library follows the conventions of neologd
58
55
  test_files: []