neologdish-normalizer 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -3
- data/.ruby-version +1 -1
- data/README.md +1 -1
- data/lib/neologdish/normalizer/version.rb +1 -1
- data/lib/neologdish/normalizer.rb +62 -12
- data/sig/generated/neologdish/normalizer/version.rbs +1 -1
- data/sig/generated/neologdish/normalizer.rbs +7 -1
- metadata +7 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57bda9b5b4b5de8dc0b6582e3a5eb5d2103d3207a405bb7b977b52e0eff02322
|
4
|
+
data.tar.gz: a35a668f517d31229e370dd4839c911f2e1b16f6c868e8537d9e61f2e44c7eee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf2788e135ccecdfdea6a4be0e52e4e1b151e4fc6c8847844299d2bbc1578c23553f92d6d47bea0e955dff2ddb941f9944e94cbfe420bbe51b733c2094497f6e
|
7
|
+
data.tar.gz: 8da7c4aa8848bef56fd5d612f22e08e0cf33cde2ccc0af829d2394da3f8bd9cb5c2ae38223adcd7d3b871df6b3909f1c550a36beb86f0bdcf6a9ab14c0d65345
|
data/.rubocop.yml
CHANGED
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.
|
1
|
+
3.4.4
|
data/README.md
CHANGED
@@ -47,5 +47,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
47
47
|
|
48
48
|
## Contributing
|
49
49
|
|
50
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer.
|
50
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/moznion/neologdish-normalizer-ruby.
|
51
51
|
|
data/lib/neologdish/normalizer.rb
CHANGED
@@ -5,6 +5,9 @@ require_relative 'normalizer/version'
|
|
5
5
|
module Neologdish
|
6
6
|
# A Japanese text normalizer module according to the neologd convention.
|
7
7
|
module Normalizer
|
8
|
+
NORMALIZED_HYPHEN = "\u002d" # -
|
9
|
+
NORMALIZED_VOWEL = "\u30fc" # ー
|
10
|
+
|
8
11
|
CONVERSION_MAP = {
|
9
12
|
# Normalize [0-9a-zA-Z] to half-width
|
10
13
|
'0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4', '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
|
@@ -15,11 +18,31 @@ module Neologdish
|
|
15
18
|
'k' => 'k', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'o' => 'o', 'p' => 'p', 'q' => 'q', 'r' => 'r', 's' => 's', 't' => 't',
|
16
19
|
'u' => 'u', 'v' => 'v', 'w' => 'w', 'x' => 'x', 'y' => 'y', 'z' => 'z',
|
17
20
|
# normalize the hyphen/minus-ish characters to '-'
|
18
|
-
|
21
|
+
"\u02d7" => NORMALIZED_HYPHEN, # ˗
|
22
|
+
"\u058a" => NORMALIZED_HYPHEN, # ֊
|
23
|
+
"\u2010" => NORMALIZED_HYPHEN, # ‐
|
24
|
+
"\u2011" => NORMALIZED_HYPHEN, # ‑
|
25
|
+
"\u2012" => NORMALIZED_HYPHEN, # ‒
|
26
|
+
"\u2013" => NORMALIZED_HYPHEN, # –
|
27
|
+
"\u2043" => NORMALIZED_HYPHEN, # ⁃
|
28
|
+
"\u207b" => NORMALIZED_HYPHEN, # ⁻
|
29
|
+
"\u208b" => NORMALIZED_HYPHEN, # ₋
|
30
|
+
"\u2212" => NORMALIZED_HYPHEN, # −
|
19
31
|
# normalize the long-vowel mark-ish characters to 'ー'
|
20
|
-
|
32
|
+
"\u2014" => NORMALIZED_VOWEL, # —
|
33
|
+
"\u2015" => NORMALIZED_VOWEL, # ―
|
34
|
+
"\u2500" => NORMALIZED_VOWEL, # ─
|
35
|
+
"\u2501" => NORMALIZED_VOWEL, # ━
|
36
|
+
"\ufe63" => NORMALIZED_VOWEL, # ﹣
|
37
|
+
"\uff0d" => NORMALIZED_VOWEL, # -
|
38
|
+
"\uff70" => NORMALIZED_VOWEL, # ー
|
21
39
|
# remove the tilde-ish characters
|
22
|
-
|
40
|
+
"\u007e" => '', # ~
|
41
|
+
"\u223c" => '', # ∼
|
42
|
+
"\u223e" => '', # ∾
|
43
|
+
"\u301c" => '', # 〜
|
44
|
+
"\u3030" => '', # 〰
|
45
|
+
"\uff5e" => '', # ~
|
23
46
|
# normalize the full-width special symbol characters (/!”#$%&’()*+,−./:;<>?@[¥]^_`{|}) and space characters to half-width
|
24
47
|
' ' => ' ', '!' => '!', '”' => '"', '#' => '#', '$' => '$', '%' => '%', '&' => '&', '’' => "'", '(' => '(', ')' => ')',
|
25
48
|
'*' => '*', '+' => '+', ',' => ',', '.' => '.', '/' => '/', ':' => ':', ';' => ';', '<' => '<', '>' => '>', '?' => '?',
|
@@ -53,18 +76,39 @@ module Neologdish
|
|
53
76
|
'ッ' => 'ッ', 'ャ' => 'ヤ', 'ュ' => 'ユ', 'ョ' => 'ヨ'
|
54
77
|
}.freeze #: Hash[String, String]
|
55
78
|
|
79
|
+
DAKUON_HANDAKUON_POSSIBLES = {
|
80
|
+
'ウ' => true,
|
81
|
+
'カ' => true, 'キ' => true, 'ク' => true, 'ケ' => true, 'コ' => true,
|
82
|
+
'サ' => true, 'シ' => true, 'ス' => true, 'セ' => true, 'ソ' => true,
|
83
|
+
'タ' => true, 'チ' => true, 'ツ' => true, 'テ' => true, 'ト' => true,
|
84
|
+
'ハ' => true, 'ヒ' => true, 'フ' => true, 'ヘ' => true, 'ホ' => true,
|
85
|
+
'う' => true,
|
86
|
+
'か' => true, 'き' => true, 'く' => true, 'け' => true, 'こ' => true,
|
87
|
+
'さ' => true, 'し' => true, 'す' => true, 'せ' => true, 'そ' => true,
|
88
|
+
'た' => true, 'ち' => true, 'つ' => true, 'て' => true, 'と' => true,
|
89
|
+
'は' => true, 'ひ' => true, 'ふ' => true, 'へ' => true, 'ほ' => true
|
90
|
+
}.freeze #: Hash[String, bool]
|
91
|
+
|
56
92
|
DAKUON_KANA_MAP = {
|
93
|
+
'ウ' => 'ヴ',
|
57
94
|
'カ' => 'ガ', 'キ' => 'ギ', 'ク' => 'グ', 'ケ' => 'ゲ', 'コ' => 'ゴ',
|
58
95
|
'サ' => 'ザ', 'シ' => 'ジ', 'ス' => 'ズ', 'セ' => 'ゼ', 'ソ' => 'ゾ',
|
59
96
|
'タ' => 'ダ', 'チ' => 'ヂ', 'ツ' => 'ヅ', 'テ' => 'デ', 'ト' => 'ド',
|
60
|
-
'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ'
|
97
|
+
'ハ' => 'バ', 'ヒ' => 'ビ', 'フ' => 'ブ', 'ヘ' => 'ベ', 'ホ' => 'ボ',
|
98
|
+
'う' => 'ゔ',
|
99
|
+
'か' => 'が', 'き' => 'ぎ', 'く' => 'ぐ', 'け' => 'げ', 'こ' => 'ご',
|
100
|
+
'さ' => 'ざ', 'し' => 'じ', 'す' => 'ず', 'せ' => 'ぜ', 'そ' => 'ぞ',
|
101
|
+
'た' => 'だ', 'ち' => 'ぢ', 'つ' => 'づ', 'て' => 'で', 'と' => 'ど',
|
102
|
+
'は' => 'ば', 'ひ' => 'び', 'ふ' => 'ぶ', 'へ' => 'べ', 'ほ' => 'ぼ'
|
61
103
|
}.freeze #: Hash[String, String]
|
62
104
|
|
63
105
|
HANDAKUON_KANA_MAP = {
|
64
|
-
'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ'
|
106
|
+
'ハ' => 'パ', 'ヒ' => 'ピ', 'フ' => 'プ', 'ヘ' => 'ペ', 'ホ' => 'ポ',
|
107
|
+
'は' => 'ぱ', 'ひ' => 'ぴ', 'ふ' => 'ぷ', 'へ' => 'ぺ', 'ほ' => 'ぽ'
|
65
108
|
}.freeze #: Hash[String, String]
|
66
109
|
|
67
|
-
private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP
|
110
|
+
private_constant :CONVERSION_MAP, :LATIN_MAP, :HALF_WIDTH_KANA_MAP, :DAKUON_KANA_MAP, :HANDAKUON_KANA_MAP, :DAKUON_HANDAKUON_POSSIBLES,
|
111
|
+
:NORMALIZED_HYPHEN, :NORMALIZED_VOWEL
|
68
112
|
|
69
113
|
# Normalize the given text.
|
70
114
|
#
|
@@ -77,23 +121,29 @@ module Neologdish
|
|
77
121
|
squeezee = ''
|
78
122
|
prev_latin = false
|
79
123
|
whitespace_encountered = false
|
80
|
-
|
124
|
+
dakuon_handakuon_possible = nil
|
81
125
|
normalized = str.chars.map do |c|
|
82
126
|
prefix = ''
|
83
127
|
c = conversion_map[c] || c
|
84
128
|
|
85
129
|
# normalize the Half-width kana to full-width
|
86
|
-
if
|
87
|
-
if (
|
88
|
-
(
|
130
|
+
if dakuon_handakuon_possible
|
131
|
+
if (["\u309b", "\u3099", "\uff9e"].include?(c) && (k = DAKUON_KANA_MAP[dakuon_handakuon_possible])) ||
|
132
|
+
(["\u309c", "\u309a", "\uff9f"].include?(c) && (k = HANDAKUON_KANA_MAP[dakuon_handakuon_possible]))
|
89
133
|
c = ''
|
90
134
|
prefix = k
|
91
135
|
else
|
92
|
-
prefix =
|
136
|
+
prefix = dakuon_handakuon_possible
|
93
137
|
end
|
94
138
|
end
|
95
139
|
|
96
140
|
if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
|
141
|
+
c = encountered_half_width_kana
|
142
|
+
end
|
143
|
+
|
144
|
+
dakuon_handakuon_possible = nil
|
145
|
+
if DAKUON_HANDAKUON_POSSIBLES[c]
|
146
|
+
dakuon_handakuon_possible = c
|
97
147
|
c = ''
|
98
148
|
end
|
99
149
|
|
@@ -120,7 +170,7 @@ module Neologdish
|
|
120
170
|
prev_latin = is_latin
|
121
171
|
|
122
172
|
prefix + c
|
123
|
-
end.join + (
|
173
|
+
end.join + (dakuon_handakuon_possible || '')
|
124
174
|
|
125
175
|
normalized.strip
|
126
176
|
end
|
data/sig/generated/neologdish/normalizer.rbs
CHANGED
@@ -3,12 +3,18 @@
|
|
3
3
|
module Neologdish
|
4
4
|
# A Japanese text normalizer module according to the neologd convention.
|
5
5
|
module Normalizer
|
6
|
+
NORMALIZED_HYPHEN: ::String
|
7
|
+
|
8
|
+
NORMALIZED_VOWEL: ::String
|
9
|
+
|
6
10
|
CONVERSION_MAP: Hash[String, String]
|
7
11
|
|
8
12
|
LATIN_MAP: Hash[String, bool]
|
9
13
|
|
10
14
|
HALF_WIDTH_KANA_MAP: Hash[String, String]
|
11
15
|
|
16
|
+
DAKUON_HANDAKUON_POSSIBLES: Hash[String, bool]
|
17
|
+
|
12
18
|
DAKUON_KANA_MAP: Hash[String, String]
|
13
19
|
|
14
20
|
HANDAKUON_KANA_MAP: Hash[String, String]
|
@@ -18,6 +24,6 @@ module Neologdish
|
|
18
24
|
# @rbs str: String
|
19
25
|
# @rbs override_conversion_map: Hash[String, String]
|
20
26
|
# @rbs return: String
|
21
|
-
def normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
|
27
|
+
def self?.normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
|
22
28
|
end
|
23
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neologdish-normalizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- moznion
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies: []
|
13
12
|
description: A Japanese text normalization library follows the conventions of neologd
|
14
13
|
with some performance optimizations. It is designed to preprocess Japanese text
|
@@ -29,14 +28,13 @@ files:
|
|
29
28
|
- renovate.json
|
30
29
|
- sig/generated/neologdish/normalizer.rbs
|
31
30
|
- sig/generated/neologdish/normalizer/version.rbs
|
32
|
-
homepage: https://github.com/moznion/neologdish-normalizer
|
31
|
+
homepage: https://github.com/moznion/neologdish-normalizer-ruby
|
33
32
|
licenses: []
|
34
33
|
metadata:
|
35
|
-
homepage_uri: https://github.com/moznion/neologdish-normalizer
|
36
|
-
source_code_uri: https://github.com/moznion/neologdish-normalizer
|
37
|
-
changelog_uri: https://github.com/moznion/neologdish-normalizer/releases
|
34
|
+
homepage_uri: https://github.com/moznion/neologdish-normalizer-ruby
|
35
|
+
source_code_uri: https://github.com/moznion/neologdish-normalizer-ruby
|
36
|
+
changelog_uri: https://github.com/moznion/neologdish-normalizer-ruby/releases
|
38
37
|
rubygems_mfa_required: 'true'
|
39
|
-
post_install_message:
|
40
38
|
rdoc_options: []
|
41
39
|
require_paths:
|
42
40
|
- lib
|
@@ -51,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
51
49
|
- !ruby/object:Gem::Version
|
52
50
|
version: '0'
|
53
51
|
requirements: []
|
54
|
-
rubygems_version: 3.
|
55
|
-
signing_key:
|
52
|
+
rubygems_version: 3.6.7
|
56
53
|
specification_version: 4
|
57
54
|
summary: A Japanese text normalization library follows the conventions of neologd
|
58
55
|
test_files: []
|