neologdish-normalizer 0.0.1 → 0.2.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -3
- data/.ruby-version +1 -1
- data/README.md +4 -6
- data/Rakefile +5 -0
- data/lib/neologdish/normalizer/version.rb +1 -1
- data/lib/neologdish/normalizer.rb +68 -15
- data/renovate.json +6 -0
- data/sig/generated/neologdish/normalizer/version.rbs +1 -1
- data/sig/generated/neologdish/normalizer.rbs +11 -1
- metadata +8 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57bda9b5b4b5de8dc0b6582e3a5eb5d2103d3207a405bb7b977b52e0eff02322
|
4
|
+
data.tar.gz: a35a668f517d31229e370dd4839c911f2e1b16f6c868e8537d9e61f2e44c7eee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf2788e135ccecdfdea6a4be0e52e4e1b151e4fc6c8847844299d2bbc1578c23553f92d6d47bea0e955dff2ddb941f9944e94cbfe420bbe51b733c2094497f6e
|
7
|
+
data.tar.gz: 8da7c4aa8848bef56fd5d612f22e08e0cf33cde2ccc0af829d2394da3f8bd9cb5c2ae38223adcd7d3b871df6b3909f1c550a36beb86f0bdcf6a9ab14c0d65345
|
data/.rubocop.yml
CHANGED
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.
|
1
|
+
3.4.4
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Neologdish::Normalizer for Ruby
|
1
|
+
# Neologdish::Normalizer for Ruby [check workflow status](https://github.com/moznion/neologdish-normalizer-ruby/actions/workflows/check.yml) [Gem Version](https://badge.fury.io/rb/neologdish-normalizer)
|
2
2
|
|
3
3
|
A Japanese text normalization library for Ruby follows the conventions of [neologd/mecab-ipadic-neologd](https://github.com/neologd/mecab-ipadic-neologd), with some performance optimizations, without external dependencies. It is designed to preprocess Japanese text before applying NLP techniques.
|
4
4
|
|
@@ -27,18 +27,16 @@ The benchmark script is here: [./scripts/benchmark.rb](./scripts/benchmark.rb)
|
|
27
27
|
|
28
28
|
## Installation
|
29
29
|
|
30
|
-
TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
|
31
|
-
|
32
30
|
Install the gem and add to the application's Gemfile by executing:
|
33
31
|
|
34
32
|
```bash
|
35
|
-
bundle add
|
33
|
+
bundle add 'neologdish-normalizer'
|
36
34
|
```
|
37
35
|
|
38
36
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
39
37
|
|
40
38
|
```bash
|
41
|
-
gem install
|
39
|
+
gem install 'neologdish-normalizer'
|
42
40
|
```
|
43
41
|
|
44
42
|
## Development
|
@@ -49,5 +47,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
49
47
|
|
50
48
|
## Contributing
|
51
49
|
|
52
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer.
|
50
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer-ruby.
|
53
51
|
|
data/Rakefile
CHANGED
data/lib/neologdish/normalizer/version.rb
CHANGED
data/lib/neologdish/normalizer.rb
CHANGED
@@ -5,6 +5,9 @@ require_relative 'normalizer/version'
|
|
5
5
|
module Neologdish
|
6
6
|
# A Japanese text normalizer module according to the neologd convention.
|
7
7
|
module Normalizer
|
8
|
+
NORMALIZED_HYPHEN = "\u002d" # -
|
9
|
+
NORMALIZED_VOWEL = "\u30fc" # ー
|
10
|
+
|
8
11
|
CONVERSION_MAP = {
|
9
12
|
# Normalize [0-9a-zA-Z] to half-width
|
10
13
|
'0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4', '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
|
@@ -15,11 +18,31 @@ module Neologdish
|
|
15
18
|
'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's', 't' => 't',
|
16
19
|
'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x', 'y' => 'y', 'z' => 'z',
|
17
20
|
# normalize the hyphen/minus-ish characters to '-'
|
18
|
-
|
21
|
+
"\u02d7" => NORMALIZED_HYPHEN, # ˗
|
22
|
+
"\u058a" => NORMALIZED_HYPHEN, # ֊
|
23
|
+
"\u2010" => NORMALIZED_HYPHEN, # ‐
|
24
|
+
"\u2011" => NORMALIZED_HYPHEN, # ‑
|
25
|
+
"\u2012" => NORMALIZED_HYPHEN, # ‒
|
26
|
+
"\u2013" => NORMALIZED_HYPHEN, # –
|
27
|
+
"\u2043" => NORMALIZED_HYPHEN, # ⁃
|
28
|
+
"\u207b" => NORMALIZED_HYPHEN, # ⁻
|
29
|
+
"\u208b" => NORMALIZED_HYPHEN, # ₋
|
30
|
+
"\u2212" => NORMALIZED_HYPHEN, # −
|
19
31
|
# normalize the long-vowel mark-ish characters to 'ー'
|
20
|
-
|
32
|
+
"\u2014" => NORMALIZED_VOWEL, # —
|
33
|
+
"\u2015" => NORMALIZED_VOWEL, # ―
|
34
|
+
"\u2500" => NORMALIZED_VOWEL, # ─
|
35
|
+
"\u2501" => NORMALIZED_VOWEL, # ━
|
36
|
+
"\ufe63" => NORMALIZED_VOWEL, # ﹣
|
37
|
+
"\uff0d" => NORMALIZED_VOWEL, # -
|
38
|
+
"\uff70" => NORMALIZED_VOWEL, # ー
|
21
39
|
# remove the tilde-ish characters
|
22
|
-
|
40
|
+
"\u007e" => '', # ~
|
41
|
+
"\u223c" => '', # ∼
|
42
|
+
"\u223e" => '', # ∾
|
43
|
+
"\u301c" => '', # 〜
|
44
|
+
"\u3030" => '', # 〰
|
45
|
+
"\uff5e" => '', # ~
|
23
46
|
# normalize the full-width special symbol characters (/!”#$%&’()*+,−./:;<>?@[¥]^_`{|}) and space characters to half-width
|
24
47
|
' ' => ' ', '!' => '!', '”' => '"', '#' => '#', '$' => '$', '%' => '%', '&' => '&', '’' => "'", '(' => '(', ')' => ')',
|
25
48
|
'*' => '*', '+' => '+', ',' => ',', '.' => '.', '/' => '/', ':' => ':', ';' => ';', '<' => '<', '>' => '>', '?' => '?',
|
@@ -53,44 +76,74 @@ module Neologdish
|
|
53
76
|
'ッ' => 'ッ', 'ャ' => 'ヤ', 'ュ' => 'ユ', 'ョ' => 'ヨ'
|
54
77
|
}.freeze #: Hash[String, String]
|
55
78
|
|
79
|
+
DAKUON_HANDAKUON_POSSIBLES = {
|
80
|
+
'ウ' => true,
|
81
|
+
'カ' => true, 'キ' => true, 'ク' => true, 'ケ' => true, 'コ' => true,
|
82
|
+
'サ' => true, 'シ' => true, 'ス' => true, 'セ' => true, 'ソ' => true,
|
83
|
+
'タ' => true, 'チ' => true, 'ツ' => true, 'テ' => true, 'ト' => true,
|
84
|
+
'ハ' => true, 'ヒ' => true, 'フ' => true, 'ヘ' => true, 'ホ' => true,
|
85
|
+
'う' => true,
|
86
|
+
'か' => true, 'き' => true, 'く' => true, 'け' => true, 'こ' => true,
|
87
|
+
'さ' => true, 'し' => true, 'す' => true, 'せ' => true, 'そ' => true,
|
88
|
+
'た' => true, 'ち' => true, 'つ' => true, 'て' => true, 'と' => true,
|
89
|
+
'は' => true, 'ひ' => true, 'ふ' => true, 'へ' => true, 'ほ' => true
|
90
|
+
}.freeze #: Hash[String, bool]
|
91
|
+
|
56
92
|
DAKUON_KANA_MAP = {
|
93
|
+
'ウ' => 'ヴ',
|
57
94
|
'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
|
58
95
|
'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
|
59
96
|
'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
|
60
|
-
'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
|
97
|
+
'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
|
98
|
+
'う' => 'ゔ',
|
99
|
+
'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
|
100
|
+
'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
|
101
|
+
'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
|
102
|
+
'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
|
61
103
|
}.freeze #: Hash[String, String]
|
62
104
|
|
63
105
|
HANDAKUON_KANA_MAP = {
|
64
|
-
'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ'
|
106
|
+
'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ',
|
107
|
+
'は' => 'ぱ', 'ひ' => 'ぴ', 'ふ' => 'ぷ', 'へ' => 'ぺ', 'ほ' => 'ぽ'
|
65
108
|
}.freeze #: Hash[String, String]
|
66
109
|
|
67
|
-
private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP
|
110
|
+
private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP, :DAKUON_HANDAKUON_POSSIBLES,
|
111
|
+
:NORMALIZED_HYPHEN, :NORMALIZED_VOWEL
|
68
112
|
|
69
113
|
# Normalize the given text.
|
70
114
|
#
|
71
115
|
# @rbs str: String
|
116
|
+
# @rbs override_conversion_map: Hash[String, String]
|
72
117
|
# @rbs return: String
|
73
|
-
def normalize(str)
|
118
|
+
def normalize(str, override_conversion_map = {})
|
119
|
+
conversion_map = CONVERSION_MAP.merge(override_conversion_map)
|
120
|
+
|
74
121
|
squeezee = ''
|
75
122
|
prev_latin = false
|
76
123
|
whitespace_encountered = false
|
77
|
-
|
124
|
+
dakuon_handakuon_possible = nil
|
78
125
|
normalized = str.chars.map do |c|
|
79
126
|
prefix = ''
|
80
|
-
c =
|
127
|
+
c = conversion_map[c] || c
|
81
128
|
|
82
129
|
# normalize the Half-width kana to full-width
|
83
|
-
if
|
84
|
-
if (
|
85
|
-
(
|
130
|
+
if dakuon_handakuon_possible
|
131
|
+
if (["\u309b", "\u3099", "\uff9e"].include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
|
132
|
+
(["\u309c", "\u309a", "\uff9f"].include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
|
86
133
|
c = ''
|
87
134
|
prefix = k
|
88
135
|
else
|
89
|
-
prefix =
|
136
|
+
prefix = dakuon_handakuon_possible
|
90
137
|
end
|
91
138
|
end
|
92
139
|
|
93
140
|
if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
|
141
|
+
c = encountered_half_width_kana
|
142
|
+
end
|
143
|
+
|
144
|
+
dakuon_handakuon_possible = nil
|
145
|
+
if DAKUON_HANDAKUON_POSSIBLES[c]
|
146
|
+
dakuon_handakuon_possible = c
|
94
147
|
c = ''
|
95
148
|
end
|
96
149
|
|
@@ -112,12 +165,12 @@ module Neologdish
|
|
112
165
|
c = ''
|
113
166
|
else
|
114
167
|
prefix = ' ' if is_latin && whitespace_encountered
|
115
|
-
whitespace_encountered
|
168
|
+
whitespace_encountered &&= c == '' # take care for consecutive spaces on the right side
|
116
169
|
end
|
117
170
|
prev_latin = is_latin
|
118
171
|
|
119
172
|
prefix + c
|
120
|
-
end.join + (
|
173
|
+
end.join + (dakuon_handakuon_possible || '')
|
121
174
|
|
122
175
|
normalized.strip
|
123
176
|
end
|
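data/renovate.json
ADDED
data/sig/generated/neologdish/normalizer/version.rbs
CHANGED
data/sig/generated/neologdish/normalizer.rbs
CHANGED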
@@ -1,19 +1,29 @@
|
|
1
1
|
# Generated from lib/neologdish/normalizer.rb with RBS::Inline
|
2
2
|
|
3
3
|
module Neologdish
|
4
|
+
# A Japanese text normalizer module according to the neologd convention.
|
4
5
|
module Normalizer
|
6
|
+
NORMALIZED_HYPHEN: ::String
|
7
|
+
|
8
|
+
NORMALIZED_VOWEL: ::String
|
9
|
+
|
5
10
|
CONVERSION_MAP: Hash[String, String]
|
6
11
|
|
7
12
|
LATIN_MAP: Hash[String, bool]
|
8
13
|
|
9
14
|
HALF_WIDTH_KANA_MAP: Hash[String, String]
|
10
15
|
|
16
|
+
DAKUON_HANDAKUON_POSSIBLES: Hash[String, bool]
|
17
|
+
|
11
18
|
DAKUON_KANA_MAP: Hash[String, String]
|
12
19
|
|
13
20
|
HANDAKUON_KANA_MAP: Hash[String, String]
|
14
21
|
|
22
|
+
# Normalize the given text.
|
23
|
+
#
|
15
24
|
# @rbs str: String
|
25
|
+
# @rbs override_conversion_map: Hash[String, String]
|
16
26
|
# @rbs return: String
|
17
|
-
def normalize: (String str) -> String
|
27
|
+
def self?.normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
|
18
28
|
end
|
19
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neologdish-normalizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- moznion
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies: []
|
13
12
|
description: A Japanese text normalization library follows the conventions of neologd
|
14
13
|
with some performance optimizations. It is designed to preprocess Japanese text
|
@@ -26,16 +25,16 @@ files:
|
|
26
25
|
- Rakefile
|
27
26
|
- lib/neologdish/normalizer.rb
|
28
27
|
- lib/neologdish/normalizer/version.rb
|
28
|
+
- renovate.json
|
29
29
|
- sig/generated/neologdish/normalizer.rbs
|
30
30
|
- sig/generated/neologdish/normalizer/version.rbs
|
31
|
-
homepage: https://github.com/moznion/neologdish-normalizer
|
31
|
+
homepage: https://github.com/moznion/neologdish-normalizer-ruby
|
32
32
|
licenses: []
|
33
33
|
metadata:
|
34
|
-
homepage_uri: https://github.com/moznion/neologdish-normalizer
|
35
|
-
source_code_uri: https://github.com/moznion/neologdish-normalizer
|
36
|
-
changelog_uri: https://github.com/moznion/neologdish-normalizer/releases
|
34
|
+
homepage_uri: https://github.com/moznion/neologdish-normalizer-ruby
|
35
|
+
source_code_uri: https://github.com/moznion/neologdish-normalizer-ruby
|
36
|
+
changelog_uri: https://github.com/moznion/neologdish-normalizer-ruby/releases
|
37
37
|
rubygems_mfa_required: 'true'
|
38
|
-
post_install_message:
|
39
38
|
rdoc_options: []
|
40
39
|
require_paths:
|
41
40
|
- lib
|
@@ -50,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
49
|
- !ruby/object:Gem::Version
|
51
50
|
version: '0'
|
52
51
|
requirements: []
|
53
|
-
rubygems_version: 3.
|
54
|
-
signing_key:
|
52
|
+
rubygems_version: 3.6.7
|
55
53
|
specification_version: 4
|
56
54
|
summary: A Japanese text normalization library follows the conventions of neologd
|
57
55
|
test_files: []
|