neologdish-normalizer 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f2f96d57b26c4e44d37d47bcabcec5a820470d4e89beb59b709c2dc36ecd449b
4
- data.tar.gz: fc7479594872db9c262cd13acf8e4f717936b289ead7fe136c65f0f211d32417
3
+ metadata.gz: 57bda9b5b4b5de8dc0b6582e3a5eb5d2103d3207a405bb7b977b52e0eff02322
4
+ data.tar.gz: a35a668f517d31229e370dd4839c911f2e1b16f6c868e8537d9e61f2e44c7eee
5
5
  SHA512:
6
- metadata.gz: 84457e6fd2b5fae0b87d51a86b0e9a5c4aaf808b436ba97654c0939630fe19142c6bb2eb6729baeb0773882726aacffb62f67139be5c03b8c2404e74ca1b2408
7
- data.tar.gz: 312f9993dceed61d3443f7ddb6c06904d49b21a52725be48721bf0683e75111b76b6b3bd53cde1d7a34315adb157f91306a75f3561f584af74d269bd71eb19aa
6
+ metadata.gz: bf2788e135ccecdfdea6a4be0e52e4e1b151e4fc6c8847844299d2bbc1578c23553f92d6d47bea0e955dff2ddb941f9944e94cbfe420bbe51b733c2094497f6e
7
+ data.tar.gz: 8da7c4aa8848bef56fd5d612f22e08e0cf33cde2ccc0af829d2394da3f8bd9cb5c2ae38223adcd7d3b871df6b3909f1c550a36beb86f0bdcf6a9ab14c0d65345
data/.rubocop.yml CHANGED
@@ -1,10 +1,9 @@
1
1
  require:
2
- - rubocop-rake
3
2
  - rubocop-minitest
4
-
3
+ plugins:
4
+ - rubocop-rake
5
5
  AllCops:
6
6
  NewCops: enable
7
-
8
7
  Metrics:
9
8
  Enabled: false
10
9
  Layout/LineLength:
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.3
1
+ 3.4.4
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Neologdish::Normalizer for Ruby
1
+ # Neologdish::Normalizer for Ruby [![Check](https://github.com/moznion/neologdish-normalizer-ruby/actions/workflows/check.yml/badge.svg)](https://github.com/moznion/neologdish-normalizer-ruby/actions/workflows/check.yml) [![Gem Version](https://badge.fury.io/rb/neologdish-normalizer.svg)](https://badge.fury.io/rb/neologdish-normalizer)
2
2
 
3
3
  A Japanese text normalization library for Ruby that follows the conventions of [neologd/mecab-ipadic-neologd](https://github.com/neologd/mecab-ipadic-neologd), with some performance optimizations and no external dependencies. It is designed to preprocess Japanese text before applying NLP techniques.
4
4
 
@@ -27,18 +27,16 @@ The benchmark script is here: [./scripts/benchmark.rb](./scripts/benchmark.rb)
27
27
 
28
28
  ## Installation
29
29
 
30
- TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
31
-
32
30
  Install the gem and add to the application's Gemfile by executing:
33
31
 
34
32
  ```bash
35
- bundle add UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
33
+ bundle add 'neologdish-normalizer'
36
34
  ```
37
35
 
38
36
  If bundler is not being used to manage dependencies, install the gem by executing:
39
37
 
40
38
  ```bash
41
- gem install UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
39
+ gem install 'neologdish-normalizer'
42
40
  ```
43
41
 
44
42
  ## Development
@@ -49,5 +47,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
49
47
 
50
48
  ## Contributing
51
49
 
52
- Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer.
50
+ Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer-ruby.
53
51
 
data/Rakefile CHANGED
@@ -12,6 +12,11 @@ namespace :rbs do
12
12
  end
13
13
  end
14
14
 
15
+ desc 'run benchmark script'
16
+ task :benchmark do
17
+ sh 'ruby ./scripts/benchmark.rb'
18
+ end
19
+
15
20
  Rake::TestTask.new do |task|
16
21
  task.libs = %w[lib test]
17
22
  task.test_files = FileList['test/**/*.rb']
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Neologdish
4
4
  module Normalizer
5
- VERSION = '0.0.1' #: String
5
+ VERSION = '0.2.0'
6
6
  end
7
7
  end
@@ -5,6 +5,9 @@ require_relative 'normalizer/version'
5
5
  module Neologdish
6
6
  # A Japanese text normalizer module according to the neologd convention.
7
7
  module Normalizer
8
+ NORMALIZED_HYPHEN = "\u002d" # -
9
+ NORMALIZED_VOWEL = "\u30fc" # ー
10
+
8
11
  CONVERSION_MAP = {
9
12
  # Normalize [0-9a-zA-Z] to half-width
10
13
  '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4', '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
@@ -15,11 +18,31 @@ module Neologdish
15
18
  'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's', 't' => 't',
16
19
  'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x', 'y' => 'y', 'z' => 'z',
17
20
  # normalize the hyphen/minus-ish characters to '-'
18
- '˗' => '-', '֊' => '-', '‐' => '-', '‑' => '-', '‒' => '-', '–' => '-', '⁃' => '-', '⁻' => '-', '₋' => '-', '−' => '-',
21
+ "\u02d7" => NORMALIZED_HYPHEN, # ˗
22
+ "\u058a" => NORMALIZED_HYPHEN, # ֊
23
+ "\u2010" => NORMALIZED_HYPHEN, # ‐
24
+ "\u2011" => NORMALIZED_HYPHEN, # ‑
25
+ "\u2012" => NORMALIZED_HYPHEN, # ‒
26
+ "\u2013" => NORMALIZED_HYPHEN, # –
27
+ "\u2043" => NORMALIZED_HYPHEN, # ⁃
28
+ "\u207b" => NORMALIZED_HYPHEN, # ⁻
29
+ "\u208b" => NORMALIZED_HYPHEN, # ₋
30
+ "\u2212" => NORMALIZED_HYPHEN, # −
19
31
  # normalize the long-vowel mark-ish characters to 'ー'
20
- '﹣' => 'ー', '-' => 'ー', 'ー' => 'ー', '' => 'ー', '―' => 'ー', '─' => 'ー', '━' => 'ー',
32
+ "\u2014" => NORMALIZED_VOWEL, # —
33
+ "\u2015" => NORMALIZED_VOWEL, # ―
34
+ "\u2500" => NORMALIZED_VOWEL, # ─
35
+ "\u2501" => NORMALIZED_VOWEL, # ━
36
+ "\ufe63" => NORMALIZED_VOWEL, # ﹣
37
+ "\uff0d" => NORMALIZED_VOWEL, # -
38
+ "\uff70" => NORMALIZED_VOWEL, # ー
21
39
  # remove the tilde-ish characters
22
- '~' => '', '∼' => '', '∾' => '', '〜' => '', '〰' => '', '~' => '',
40
+ "\u007e" => '', # ~
41
+ "\u223c" => '', # ∼
42
+ "\u223e" => '', # ∾
43
+ "\u301c" => '', # 〜
44
+ "\u3030" => '', # 〰
45
+ "\uff5e" => '', # ~
23
46
  # normalize the full-width special symbol characters (/!”#$%&’()*+,−./:;<>?@[¥]^_`{|}) and space characters to half-width
24
47
  ' ' => ' ', '!' => '!', '”' => '"', '#' => '#', '$' => '$', '%' => '%', '&' => '&', '’' => "'", '(' => '(', ')' => ')',
25
48
  '*' => '*', '+' => '+', ',' => ',', '.' => '.', '/' => '/', ':' => ':', ';' => ';', '<' => '<', '>' => '>', '?' => '?',
@@ -53,44 +76,74 @@ module Neologdish
53
76
  'ッ' => 'ッ', 'ャ' => 'ヤ', 'ュ' => 'ユ', 'ョ' => 'ヨ'
54
77
  }.freeze #: Hash[String, String]
55
78
 
79
+ DAKUON_HANDAKUON_POSSIBLES = {
80
+ 'ウ' => true,
81
+ 'カ' => true, 'キ' => true, 'ク' => true, 'ケ' => true, 'コ' => true,
82
+ 'サ' => true, 'シ' => true, 'ス' => true, 'セ' => true, 'ソ' => true,
83
+ 'タ' => true, 'チ' => true, 'ツ' => true, 'テ' => true, 'ト' => true,
84
+ 'ハ' => true, 'ヒ' => true, 'フ' => true, 'ヘ' => true, 'ホ' => true,
85
+ 'う' => true,
86
+ 'か' => true, 'き' => true, 'く' => true, 'け' => true, 'こ' => true,
87
+ 'さ' => true, 'し' => true, 'す' => true, 'せ' => true, 'そ' => true,
88
+ 'た' => true, 'ち' => true, 'つ' => true, 'て' => true, 'と' => true,
89
+ 'は' => true, 'ひ' => true, 'ふ' => true, 'へ' => true, 'ほ' => true
90
+ }.freeze #: Hash[String, bool]
91
+
56
92
  DAKUON_KANA_MAP = {
93
+ 'ウ' => 'ヴ',
57
94
  'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
58
95
  'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
59
96
  'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
60
- 'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
97
+ 'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
98
+ 'う' => 'ゔ',
99
+ 'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
100
+ 'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
101
+ 'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
102
+ 'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
61
103
  }.freeze #: Hash[String, String]
62
104
 
63
105
  HANDAKUON_KANA_MAP = {
64
- 'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ'
106
+ 'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ',
107
+ 'は' => 'ぱ', 'ひ' => 'ぴ', 'ふ' => 'ぷ', 'へ' => 'ぺ', 'ほ' => 'ぽ'
65
108
  }.freeze #: Hash[String, String]
66
109
 
67
- private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP
110
+ private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP, :DAKUON_HANDAKUON_POSSIBLES,
111
+ :NORMALIZED_HYPHEN, :NORMALIZED_VOWEL
68
112
 
69
113
  # Normalize the given text.
70
114
  #
71
115
  # @rbs str: String
116
+ # @rbs override_conversion_map: Hash[String, String]
72
117
  # @rbs return: String
73
- def normalize(str)
118
+ def normalize(str, override_conversion_map = {})
119
+ conversion_map = CONVERSION_MAP.merge(override_conversion_map)
120
+
74
121
  squeezee = ''
75
122
  prev_latin = false
76
123
  whitespace_encountered = false
77
- encountered_half_width_kana = nil
124
+ dakuon_handakuon_possible = nil
78
125
  normalized = str.chars.map do |c|
79
126
  prefix = ''
80
- c = CONVERSION_MAP[c] || c
127
+ c = conversion_map[c] || c
81
128
 
82
129
  # normalize the Half-width kana to full-width
83
- if encountered_half_width_kana
84
- if (c == '゙' && (k = DAKUON_KANA_MAP[encountered_half_width_kana])) ||
85
- (c == '゚' && (k = HANDAKUON_KANA_MAP[encountered_half_width_kana]))
130
+ if dakuon_handakuon_possible
131
+ if (["\u309b", "\u3099", "\uff9e"].include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
132
+ (["\u309c", "\u309a", "\uff9f"].include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
86
133
  c = ''
87
134
  prefix = k
88
135
  else
89
- prefix = encountered_half_width_kana
136
+ prefix = dakuon_handakuon_possible
90
137
  end
91
138
  end
92
139
 
93
140
  if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
141
+ c = encountered_half_width_kana
142
+ end
143
+
144
+ dakuon_handakuon_possible = nil
145
+ if DAKUON_HANDAKUON_POSSIBLES[c]
146
+ dakuon_handakuon_possible = c
94
147
  c = ''
95
148
  end
96
149
 
@@ -112,12 +165,12 @@ module Neologdish
112
165
  c = ''
113
166
  else
114
167
  prefix = ' ' if is_latin && whitespace_encountered
115
- whitespace_encountered = false
168
+ whitespace_encountered &&= c == '' # take care of consecutive spaces on the right side
116
169
  end
117
170
  prev_latin = is_latin
118
171
 
119
172
  prefix + c
120
- end.join + (encountered_half_width_kana || '')
173
+ end.join + (dakuon_handakuon_possible || '')
121
174
 
122
175
  normalized.strip
123
176
  end
data/renovate.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3
+ "extends": [
4
+ "config:recommended"
5
+ ]
6
+ }
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Neologdish
4
4
  module Normalizer
5
- VERSION: String
5
+ VERSION: ::String
6
6
  end
7
7
  end
@@ -1,19 +1,29 @@
1
1
  # Generated from lib/neologdish/normalizer.rb with RBS::Inline
2
2
 
3
3
  module Neologdish
4
+ # A Japanese text normalizer module according to the neologd convention.
4
5
  module Normalizer
6
+ NORMALIZED_HYPHEN: ::String
7
+
8
+ NORMALIZED_VOWEL: ::String
9
+
5
10
  CONVERSION_MAP: Hash[String, String]
6
11
 
7
12
  LATIN_MAP: Hash[String, bool]
8
13
 
9
14
  HALF_WIDTH_KANA_MAP: Hash[String, String]
10
15
 
16
+ DAKUON_HANDAKUON_POSSIBLES: Hash[String, bool]
17
+
11
18
  DAKUON_KANA_MAP: Hash[String, String]
12
19
 
13
20
  HANDAKUON_KANA_MAP: Hash[String, String]
14
21
 
22
+ # Normalize the given text.
23
+ #
15
24
  # @rbs str: String
25
+ # @rbs override_conversion_map: Hash[String, String]
16
26
  # @rbs return: String
17
- def normalize: (String str) -> String
27
+ def self?.normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
18
28
  end
19
29
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neologdish-normalizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - moznion
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-10-28 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies: []
13
12
  description: A Japanese text normalization library follows the conventions of neologd
14
13
  with some performance optimizations. It is designed to preprocess Japanese text
@@ -26,16 +25,16 @@ files:
26
25
  - Rakefile
27
26
  - lib/neologdish/normalizer.rb
28
27
  - lib/neologdish/normalizer/version.rb
28
+ - renovate.json
29
29
  - sig/generated/neologdish/normalizer.rbs
30
30
  - sig/generated/neologdish/normalizer/version.rbs
31
- homepage: https://github.com/moznion/neologdish-normalizer
31
+ homepage: https://github.com/moznion/neologdish-normalizer-ruby
32
32
  licenses: []
33
33
  metadata:
34
- homepage_uri: https://github.com/moznion/neologdish-normalizer
35
- source_code_uri: https://github.com/moznion/neologdish-normalizer
36
- changelog_uri: https://github.com/moznion/neologdish-normalizer/releases
34
+ homepage_uri: https://github.com/moznion/neologdish-normalizer-ruby
35
+ source_code_uri: https://github.com/moznion/neologdish-normalizer-ruby
36
+ changelog_uri: https://github.com/moznion/neologdish-normalizer-ruby/releases
37
37
  rubygems_mfa_required: 'true'
38
- post_install_message:
39
38
  rdoc_options: []
40
39
  require_paths:
41
40
  - lib
@@ -50,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
50
49
  - !ruby/object:Gem::Version
51
50
  version: '0'
52
51
  requirements: []
53
- rubygems_version: 3.5.22
54
- signing_key:
52
+ rubygems_version: 3.6.7
55
53
  specification_version: 4
56
54
  summary: A Japanese text normalization library follows the conventions of neologd
57
55
  test_files: []