latinizer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79227b3ecff306b41fc210a377ddb1811a8cdfc2823390bf7ea3a59b6a91c7fc
4
- data.tar.gz: 32a70e923290a31c5b8fdc1eeda516c2390333445adcacfac712e8fcd5f2e9ec
3
+ metadata.gz: 21b15ac5fbb50b85ac397a6b1e58bb4e7e63c5ce9800d7c21eddd02a5fa09d7d
4
+ data.tar.gz: 36e10b84402711854a0345eaf5ccf488a125ce197690daabdff6443526d20522
5
5
  SHA512:
6
- metadata.gz: b513f27ef4716f3827aad163ea9ba7c260cf2b64ed20ff455624fef52c174f10f5e763f85b6f62ccf31adf64382aedd1d56d9e0443ac9cf411bb78620411ab53
7
- data.tar.gz: 0efcd64fa9ce46b45efdbdcdbd3150e478add0d42566279461f90f145068f201efed0ad4adb43fadddc3cf1327a0b0e8c2d9eea8a35ea2941e8791b27699a8a0
6
+ metadata.gz: 4c1b6a8eddc5258615cf0017b8209e6ee15ecb4332e0f241457e7a403364597e4ee525a89bcec3a93eb2e89821498ac65cbae893183b54cffffa7e88d3431672
7
+ data.tar.gz: 3a56c28ccbb182a25ba8e65998d0c95038f613ad4cbdd42d81103e47dc26bcacd0f7eee80549650736d9bb5657e9dd1de9bd824709a1ab9602ca9c77ff9417f1
@@ -1,293 +1,70 @@
1
1
  class Latinizer
2
- require 'chinese_pinyin'
3
- require 'mecab_standalone'
4
- require 'romaji'
5
2
  require 'translit'
6
3
  require 'unicode/scripts'
7
4
  require 'babosa'
5
+ require_relative './lib/arabic.rb'
6
+ require_relative './lib/han.rb'
7
+ require_relative './lib/japanese.rb'
8
+
9
+ SUPPORTED_SCRIPTS = [
10
+ 'Arabic',
11
+ 'Cyrillic',
12
+ 'Han',
13
+ 'Japanese'
14
+ ]
8
15
 
9
16
  def self.t(text, opt = nil)
10
- scripts = Unicode::Scripts.scripts(text) - ['Common', 'Inherited', 'Latin']
11
- pinyin_options = {tonemarks: true}
17
+ scripts = detect_non_latin_scripts(text)
12
18
 
13
- if opt == :ascii
14
- pinyin_options = {}
15
- elsif opt == :ja
16
- return romanize_japanese(text)
19
+ if scripts.size == 0
20
+ return opt == :ascii ? remove_non_ascii(text) : text
21
+ elsif scripts.size > 1
22
+ latinized = latinize_script(text, scripts.first, opt)
23
+ return t(latinized, opt)
17
24
  end
18
25
 
19
- if scripts.size == 1
20
- case scripts.first
21
- when 'Arabic'
22
- return romanize_arabic(text)
23
- when 'Cyrillic'
24
- latinized = Translit.convert(text, :english)
25
- return opt == :ascii ? latinized.to_slug.to_ascii.to_s : latinized
26
- when 'Han'
27
- return Pinyin.t(text, pinyin_options)
28
- end
29
- end
26
+ latinized = latinize_script(text, scripts.first, opt)
27
+ opt == :ascii ? remove_non_ascii(latinized) : latinized
28
+ end
30
29
 
31
- if is_japanese?(scripts)
32
- return romanize_japanese(text)
30
+ def self.latinize_script(text, script, opt = nil)
31
+ case script
32
+ when 'Arabic'
33
+ return Arabic.t(text)
34
+ when 'Cyrillic'
35
+ latinized = Translit.convert(text, :english)
36
+ return opt == :ascii ? remove_diacritics(latinized) : latinized
37
+ when 'Han'
38
+ return Han.t(text, opt)
39
+ when 'Japanese'
40
+ return Japanese.t(text)
33
41
  end
34
-
35
42
  text
36
43
  end
37
44
 
38
- def self.has_non_latin?(text)
45
+ def self.detect_non_latin_scripts(text)
39
46
  scripts = Unicode::Scripts.scripts(text) - ['Common', 'Inherited', 'Latin']
40
- scripts.size > 0 ? true : false
47
+ if is_japanese?(scripts)
48
+ scripts -= ['Han', 'Hiragana', 'Katakana']
49
+ scripts += ['Japanese']
50
+ end
51
+ scripts.intersection(SUPPORTED_SCRIPTS)
41
52
  end
42
53
 
43
- def self.is_japanese?(scripts) #fix only kana text
44
- (scripts.include?('Han') && (scripts.include?('Hiragana') || scripts.include?('Katakana'))) ||
45
- (scripts.include?('Hiragana') || scripts.include?('Katakana'))
54
+ def self.remove_diacritics(text)
55
+ text.to_slug.transliterate.to_s
46
56
  end
47
57
 
48
- def self.romanize_japanese(text)
49
- Romaji.kana2romaji(parse_japanese(text)
50
- .map{|k| k[-1]}
51
- .join(' ')
52
- .gsub('ー','')
53
- .gsub(' 。','.')
54
- .gsub(' ・','-')
55
- .gsub(' 、',',')
56
- )
58
+ def self.remove_non_ascii(text)
59
+ text.to_slug.transliterate.to_ascii.to_s
57
60
  end
58
61
 
59
- def self.parse_japanese(text)
60
- mecab_parsed = MecabStandalone.parse(text)
61
- .split("\n")
62
- .map{|k| k.split("\t")}.tap(&:pop)
63
- .map{|k| [k[0]].concat(k[1].split(','))}
64
- .map{|k| [k[0], k[1], k[-2]]}
65
- tokenized_kana = []
66
- mecab_parsed.each do |token|
67
- if token[1] == "助動詞"
68
- tokenized_kana[-1][0] += token[0]
69
- tokenized_kana[-1][-1] += token[-1]
70
- elsif token[-1] == '*'
71
- tokenized_kana << [token[0], token[1], token[0]]
72
- else
73
- tokenized_kana << token
74
- end
75
- end
76
- tokenized_kana
62
+ def self.has_non_latin?(text)
63
+ scripts = Unicode::Scripts.scripts(text) - ['Common', 'Inherited', 'Latin']
64
+ scripts.size > 0 ? true : false
77
65
  end
78
66
 
79
- def self.romanize_arabic(text)
80
- text
81
- .gsub('،',',') # ARABIC COMMA
82
- .gsub('؛',';') # ARABIC SEMICOLON
83
- .gsub('؟','?') # ARABIC QUESTION MARK
84
- .gsub('ء',"'") # ARABIC LETTER HAMZA
85
- .gsub('آ','a') # ARABIC LETTER ALEF WITH MADDA ABOVE
86
- .gsub('أ','a') # ARABIC LETTER ALEF WITH HAMZA ABOVE
87
- .gsub('ؤ','w') # ARABIC LETTER WAW WITH HAMZA ABOVE
88
- .gsub('إ','i') # ARABIC LETTER ALEF WITH HAMZA BELOW
89
- .gsub('ئ','ye') # ARABIC LETTER YEH WITH HAMZA ABOVE
90
- .gsub('ا','a') # ARABIC LETTER ALEF
91
- .gsub('ب','b') # ARABIC LETTER BEH
92
- .gsub('ة','a') # ARABIC LETTER TEH MARBUTA
93
- .gsub('ت','t') # ARABIC LETTER TEH
94
- .gsub('ث','th') # ARABIC LETTER THEH
95
- .gsub('ج','j') # ARABIC LETTER JEEM
96
- .gsub('ح','h') # ARABIC LETTER HAH
97
- .gsub('خ','kh') # ARABIC LETTER KHAH
98
- .gsub('د','d') # ARABIC LETTER DAL
99
- .gsub('ذ','th') # ARABIC LETTER THAL
100
- .gsub('ر','r') # ARABIC LETTER REH
101
- .gsub('ز','z') # ARABIC LETTER ZAIN
102
- .gsub('س','s') # ARABIC LETTER SEEN
103
- .gsub('ش','sh') # ARABIC LETTER SHEEN
104
- .gsub('ص','s') # ARABIC LETTER SAD
105
- .gsub('ض','d') # ARABIC LETTER DAD
106
- .gsub('ط','t') # ARABIC LETTER TAH
107
- .gsub('ظ','z') # ARABIC LETTER ZAH
108
- .gsub('ع',"'") # ARABIC LETTER AIN
109
- .gsub('غ','gh') # ARABIC LETTER GHAIN
110
- .gsub('ـ','-') # ARABIC TATWEEL
111
- .gsub('ف','f') # ARABIC LETTER FEH
112
- .gsub('ق','q') # ARABIC LETTER QAF
113
- .gsub('ك','k') # ARABIC LETTER KAF
114
- .gsub('ل','l') # ARABIC LETTER LAM
115
- .gsub('م','m') # ARABIC LETTER MEEM
116
- .gsub('ن','n') # ARABIC LETTER NOON
117
- .gsub('ه','h') # ARABIC LETTER HEH
118
- .gsub('و','w') # ARABIC LETTER WAW
119
- .gsub('ى','a') # ARABIC LETTER ALEF MAKSURA
120
- .gsub('ي','y') # ARABIC LETTER YEH
121
- .gsub('َ','a') # ARABIC FATHA
122
- .gsub('ُ','u') # ARABIC DAMMA
123
- .gsub('ِ','i') # ARABIC KASRA
124
- .gsub('ْ','') # ARABIC SUKUN
125
- .gsub('ٔ',"'") # ARABIC HAMZA ABOVE
126
- .gsub('ٕ',"'") # ARABIC HAMZA BELOW
127
- .gsub('٠','0') # ARABIC-INDIC DIGIT ZERO
128
- .gsub('١','1') # ARABIC-INDIC DIGIT ONE
129
- .gsub('٢','2') # ARABIC-INDIC DIGIT TWO
130
- .gsub('٣','3') # ARABIC-INDIC DIGIT THREE
131
- .gsub('٤','4') # ARABIC-INDIC DIGIT FOUR
132
- .gsub('٥','5') # ARABIC-INDIC DIGIT FIVE
133
- .gsub('٦','6') # ARABIC-INDIC DIGIT SIX
134
- .gsub('٧','7') # ARABIC-INDIC DIGIT SEVEN
135
- .gsub('٨','8') # ARABIC-INDIC DIGIT EIGHT
136
- .gsub('٩','9') # ARABIC-INDIC DIGIT NINE
137
- .gsub('٪','%') # ARABIC PERCENT SIGN
138
- .gsub('٫',',') # ARABIC DECIMAL SEPARATOR
139
- .gsub('٬',',') # ARABIC THOUSANDS SEPARATOR
140
- .gsub('ٮ','b') # ARABIC LETTER DOTLESS BEH
141
- .gsub('ٯ','q') # ARABIC LETTER DOTLESS QAF
142
- .gsub('ٰ','a') # ARABIC LETTER SUPERSCRIPT ALEF
143
- .gsub('ٱ','a') # ARABIC LETTER ALEF WASLA
144
- .gsub('ٲ','a') # ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
145
- .gsub('ٳ','a') # ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
146
- .gsub('ٷ','u') # ARABIC LETTER U WITH HAMZA ABOVE
147
- .gsub('ٹ','tt') # ARABIC LETTER TTEH
148
- .gsub('ٺ','tt') # ARABIC LETTER TTEHEH
149
- .gsub('ٻ','b') # ARABIC LETTER BEEH
150
- .gsub('ټ','t') # ARABIC LETTER TEH WITH RING
151
- .gsub('ٽ','t') # ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
152
- .gsub('پ','p') # ARABIC LETTER PEH
153
- .gsub('ٿ','t') # ARABIC LETTER TEHEH
154
- .gsub('ڀ','b') # ARABIC LETTER BEHEH
155
- .gsub('ځ','h') # ARABIC LETTER HAH WITH HAMZA ABOVE
156
- .gsub('ڂ','h') # ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
157
- .gsub('ڃ','ny') # ARABIC LETTER NYEH
158
- .gsub('ڄ','dy') # ARABIC LETTER DYEH
159
- .gsub('څ','h') # ARABIC LETTER HAH WITH THREE DOTS ABOVE
160
- .gsub('چ','tch') # ARABIC LETTER TCHEH
161
- .gsub('ڇ','tch') # ARABIC LETTER TCHEHEH
162
- .gsub('ڈ','dd') # ARABIC LETTER DDAL
163
- .gsub('ډ','d') # ARABIC LETTER DAL WITH RING
164
- .gsub('ڊ','d') # ARABIC LETTER DAL WITH DOT BELOW
165
- .gsub('ڋ','d') # ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
166
- .gsub('ڌ','d') # ARABIC LETTER DAHAL
167
- .gsub('ڍ','dd') # ARABIC LETTER DDAHAL
168
- .gsub('ڎ','d') # ARABIC LETTER DUL
169
- .gsub('ڏ','d') # ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
170
- .gsub('ڐ','d') # ARABIC LETTER DAL WITH FOUR DOTS ABOVE
171
- .gsub('ڑ','rr') # ARABIC LETTER RREH
172
- .gsub('ڒ','r') # ARABIC LETTER REH WITH SMALL V
173
- .gsub('ړ','r') # ARABIC LETTER REH WITH RING
174
- .gsub('ڔ','r') # ARABIC LETTER REH WITH DOT BELOW
175
- .gsub('ڕ','r') # ARABIC LETTER REH WITH SMALL V BELOW
176
- .gsub('ږ','r') # ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
177
- .gsub('ڗ','r') # ARABIC LETTER REH WITH TWO DOTS ABOVE
178
- .gsub('ژ','j') # ARABIC LETTER JEH
179
- .gsub('ڙ','r') # ARABIC LETTER REH WITH FOUR DOTS ABOVE
180
- .gsub('ښ','s') # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
181
- .gsub('ڛ','s') # ARABIC LETTER SEEN WITH THREE DOTS BELOW
182
- .gsub('ڜ','s') # ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
183
- .gsub('ڝ','s') # ARABIC LETTER SAD WITH TWO DOTS BELOW
184
- .gsub('ڞ','s') # ARABIC LETTER SAD WITH THREE DOTS ABOVE
185
- .gsub('ڟ','t') # ARABIC LETTER TAH WITH THREE DOTS ABOVE
186
- .gsub('ڠ','n') # ARABIC LETTER AIN WITH THREE DOTS ABOVE
187
- .gsub('ڡ','f') # ARABIC LETTER DOTLESS FEH
188
- .gsub('ڢ','f') # ARABIC LETTER FEH WITH DOT MOVED BELOW
189
- .gsub('ڣ','f') # ARABIC LETTER FEH WITH DOT BELOW
190
- .gsub('ڤ','v') # ARABIC LETTER VEH
191
- .gsub('ڥ','f') # ARABIC LETTER FEH WITH THREE DOTS BELOW
192
- .gsub('ڦ','p') # ARABIC LETTER PEHEH
193
- .gsub('ڧ','q') # ARABIC LETTER QAF WITH DOT ABOVE
194
- .gsub('ڨ','q') # ARABIC LETTER QAF WITH THREE DOTS ABOVE
195
- .gsub('ک','k') # ARABIC LETTER KEHEH
196
- .gsub('ڪ','k') # ARABIC LETTER SWASH KAF
197
- .gsub('ګ','k') # ARABIC LETTER KAF WITH RING
198
- .gsub('ڬ','k') # ARABIC LETTER KAF WITH DOT ABOVE
199
- .gsub('ڭ','ng') # ARABIC LETTER NG
200
- .gsub('ڮ','k') # ARABIC LETTER KAF WITH THREE DOTS BELOW
201
- .gsub('گ','g') # ARABIC LETTER GAF
202
- .gsub('ڰ','g') # ARABIC LETTER GAF WITH RING
203
- .gsub('ڱ','ng') # ARABIC LETTER NGOEH
204
- .gsub('ڲ','g') # ARABIC LETTER GAF WITH TWO DOTS BELOW
205
- .gsub('ڳ','g') # ARABIC LETTER GUEH
206
- .gsub('ڴ','g') # ARABIC LETTER GAF WITH THREE DOTS ABOVE
207
- .gsub('ڵ','l') # ARABIC LETTER LAM WITH SMALL V
208
- .gsub('ڶ','l') # ARABIC LETTER LAM WITH DOT ABOVE
209
- .gsub('ڷ','l') # ARABIC LETTER LAM WITH THREE DOTS ABOVE
210
- .gsub('ڸ','l') # ARABIC LETTER LAM WITH THREE DOTS BELOW
211
- .gsub('ڹ','n') # ARABIC LETTER NOON WITH DOT BELOW
212
- .gsub('ں','n') # ARABIC LETTER NOON GHUNNA
213
- .gsub('ڻ','rn') # ARABIC LETTER RNOON
214
- .gsub('ڼ','n') # ARABIC LETTER NOON WITH RING
215
- .gsub('ڽ','n') # ARABIC LETTER NOON WITH THREE DOTS ABOVE
216
- .gsub('ھ','h') # ARABIC LETTER HEH DOACHASHMEE
217
- .gsub('ڿ','tch') # ARABIC LETTER TCHEH WITH DOT ABOVE
218
- .gsub('ۀ','h') # ARABIC LETTER HEH WITH YEH ABOVE
219
- .gsub('ہ','h') # ARABIC LETTER HEH GOAL
220
- .gsub('ۂ','h') # ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
221
- .gsub('ۃ','a') # ARABIC LETTER TEH MARBUTA GOAL
222
- .gsub('ۄ','w') # ARABIC LETTER WAW WITH RING
223
- .gsub('ۅ','oe') # ARABIC LETTER KIRGHIZ OE
224
- .gsub('ۆ','oe') # ARABIC LETTER OE
225
- .gsub('ۇ','u') # ARABIC LETTER U
226
- .gsub('ۈ','yu') # ARABIC LETTER YU
227
- .gsub('ۉ','yu') # ARABIC LETTER KIRGHIZ YU
228
- .gsub('ۊ','w') # ARABIC LETTER WAW WITH TWO DOTS ABOVE
229
- .gsub('ۋ','v') # ARABIC LETTER VE
230
- .gsub('ی','y') # ARABIC LETTER FARSI YEH
231
- .gsub('ۍ','y') # ARABIC LETTER YEH WITH TAIL
232
- .gsub('ێ','y') # ARABIC LETTER YEH WITH SMALL V
233
- .gsub('ۏ','w') # ARABIC LETTER WAW WITH DOT ABOVE
234
- .gsub('ې','e') # ARABIC LETTER E
235
- .gsub('ۑ','y') # ARABIC LETTER YEH WITH THREE DOTS BELOW
236
- .gsub('ے','y') # ARABIC LETTER YEH BARREE
237
- .gsub('ۓ','y') # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
238
- .gsub('۔','.') # ARABIC FULL STOP
239
- .gsub('ە','ae') # ARABIC LETTER AE
240
- .gsub('ۮ','d') # ARABIC LETTER DAL WITH INVERTED V
241
- .gsub('ۯ','r') # ARABIC LETTER REH WITH INVERTED V
242
- .gsub('۰','0') # EXTENDED ARABIC-INDIC DIGIT ZERO
243
- .gsub('۱','1') # EXTENDED ARABIC-INDIC DIGIT ONE
244
- .gsub('۲','2') # EXTENDED ARABIC-INDIC DIGIT TWO
245
- .gsub('۳','3') # EXTENDED ARABIC-INDIC DIGIT THREE
246
- .gsub('۴','4') # EXTENDED ARABIC-INDIC DIGIT FOUR
247
- .gsub('۵','5') # EXTENDED ARABIC-INDIC DIGIT FIVE
248
- .gsub('۶','6') # EXTENDED ARABIC-INDIC DIGIT SIX
249
- .gsub('۷','7') # EXTENDED ARABIC-INDIC DIGIT SEVEN
250
- .gsub('۸','8') # EXTENDED ARABIC-INDIC DIGIT EIGHT
251
- .gsub('۹','9') # EXTENDED ARABIC-INDIC DIGIT NINE
252
- .gsub('ۺ','sh') # ARABIC LETTER SHEEN WITH DOT BELOW
253
- .gsub('ۻ','d') # ARABIC LETTER DAD WITH DOT BELOW
254
- .gsub('ۼ','gh') # ARABIC LETTER GHAIN WITH DOT BELOW
255
- .gsub('۽','&') # ARABIC SIGN SINDHI AMPERSAND
256
- .gsub('ﷲ','Allah') # ARABIC LIGATURE ALLAH ISOLATED FORM
257
- .gsub('و','w') # Arabic letter waw
258
- .gsub('ء',"'") # hamza
259
- .gsub('ٔ',"'") # hamza above
260
- .gsub('ٕ',"'") # hamza below
261
- .gsub('ع',"'") # ain
262
- .gsub('آ','a') # alef madda
263
- .gsub('إ','i') # alef with hamza below
264
- .gsub('ٱ','a') # alef wasla
265
- .gsub('ة','a') # teh marbuta
266
- .gsub('ۃ','a') # teh marbuta goal
267
- .gsub('ي','y') # Arabic yeh
268
- .gsub('ى','a') # alef maksura
269
- .gsub('ﻯ','a') # alef maksura isolated form
270
- .gsub('ﻰ','a') # alef maksura final form
271
- .gsub('ﯨ','a') # Uighur Kazach Kirghiz alef maksura initial form
272
- .gsub('ﯩ','a') # Uighur Kazach Kirghiz alef maksura medial form
273
- .gsub('ٰ','a ') # Arabic letter superscript alef
274
- .gsub('ـ','') # tatweel (filler)
275
- .gsub('َ','a') # fatha ("-a")
276
- .gsub('ُ','u') # damma ("-u")
277
- .gsub('ِ','i') # kasra ("-i")
278
- .gsub('ْ','') # sukun (no vowel)
279
- .gsub('ۡ','') # comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
280
- .gsub('اً','an') # alef + fathatan
281
- .gsub('ً','') # fathatan ("-an")
282
- .gsub('ٌ','') # dammatan ("-un")
283
- .gsub('ٍ','') # kasratan ("-in")
284
- .gsub('ّ','') # shadda (consonant doubler)
285
- .gsub('ڃ','ny') # Arabic letter nyeh U+0683 (used in Sindhi (snd))
286
- .gsub('ڄ','dy') # Arabic letter dyeh U+0684 (used in Sindhi (snd))
287
- .gsub('۾','men') # Sindhi postposition men
288
- .gsub('ؑ','alayhe wasallam') # "upon him be peace"
289
- .gsub('ﷴ','Mohammad') # "Mohammad"
290
- .gsub('ﷸ','wasallam') # "and peace"
291
- .gsub('ﷺ','sallallahou alayhe wasallam') # "prayer of God be upon him and his family and peace"
67
+ def self.is_japanese?(scripts)
68
+ scripts.include?('Hiragana') || scripts.include?('Katakana')
292
69
  end
293
70
  end
@@ -0,0 +1,199 @@
1
# Transliterates Arabic-script text into Latin characters using a fixed
# character-to-string lookup table.
class Arabic

  # Lookup table: Arabic-script character (or character sequence) => Latin
  # replacement. An empty replacement removes the character from the output.
  ARABIC = {
    '،' => ',', # ARABIC COMMA
    '؛' => ';', # ARABIC SEMICOLON
    '؟' => '?', # ARABIC QUESTION MARK
    'ء' => "'", # ARABIC LETTER HAMZA
    'آ' => 'a', # ARABIC LETTER ALEF WITH MADDA ABOVE
    'أ' => 'a', # ARABIC LETTER ALEF WITH HAMZA ABOVE
    'ؤ' => 'w', # ARABIC LETTER WAW WITH HAMZA ABOVE
    'إ' => 'i', # ARABIC LETTER ALEF WITH HAMZA BELOW
    'ئ' => 'ye', # ARABIC LETTER YEH WITH HAMZA ABOVE
    'ا' => 'a', # ARABIC LETTER ALEF
    'ب' => 'b', # ARABIC LETTER BEH
    'ة' => 'a', # ARABIC LETTER TEH MARBUTA
    'ت' => 't', # ARABIC LETTER TEH
    'ث' => 'th', # ARABIC LETTER THEH
    'ج' => 'j', # ARABIC LETTER JEEM
    'ح' => 'h', # ARABIC LETTER HAH
    'خ' => 'kh', # ARABIC LETTER KHAH
    'د' => 'd', # ARABIC LETTER DAL
    'ذ' => 'th', # ARABIC LETTER THAL
    'ر' => 'r', # ARABIC LETTER REH
    'ز' => 'z', # ARABIC LETTER ZAIN
    'س' => 's', # ARABIC LETTER SEEN
    'ش' => 'sh', # ARABIC LETTER SHEEN
    'ص' => 's', # ARABIC LETTER SAD
    'ض' => 'd', # ARABIC LETTER DAD
    'ط' => 't', # ARABIC LETTER TAH
    'ظ' => 'z', # ARABIC LETTER ZAH
    'ع' => "'", # ARABIC LETTER AIN
    'غ' => 'gh', # ARABIC LETTER GHAIN
    'ـ' => '-', # ARABIC TATWEEL
    'ف' => 'f', # ARABIC LETTER FEH
    'ق' => 'q', # ARABIC LETTER QAF
    'ك' => 'k', # ARABIC LETTER KAF
    'ل' => 'l', # ARABIC LETTER LAM
    'م' => 'm', # ARABIC LETTER MEEM
    'ن' => 'n', # ARABIC LETTER NOON
    'ه' => 'h', # ARABIC LETTER HEH
    'و' => 'w', # ARABIC LETTER WAW
    'ى' => 'a', # ARABIC LETTER ALEF MAKSURA
    'ي' => 'y', # ARABIC LETTER YEH
    'َ' => 'a', # ARABIC FATHA
    'ُ' => 'u', # ARABIC DAMMA
    'ِ' => 'i', # ARABIC KASRA
    'ْ' => '', # ARABIC SUKUN
    'ٔ' => "'", # ARABIC HAMZA ABOVE
    'ٕ' => "'", # ARABIC HAMZA BELOW
    '٠' => '0', # ARABIC-INDIC DIGIT ZERO
    '١' => '1', # ARABIC-INDIC DIGIT ONE
    '٢' => '2', # ARABIC-INDIC DIGIT TWO
    '٣' => '3', # ARABIC-INDIC DIGIT THREE
    '٤' => '4', # ARABIC-INDIC DIGIT FOUR
    '٥' => '5', # ARABIC-INDIC DIGIT FIVE
    '٦' => '6', # ARABIC-INDIC DIGIT SIX
    '٧' => '7', # ARABIC-INDIC DIGIT SEVEN
    '٨' => '8', # ARABIC-INDIC DIGIT EIGHT
    '٩' => '9', # ARABIC-INDIC DIGIT NINE
    '٪' => '%', # ARABIC PERCENT SIGN
    '٫' => ',', # ARABIC DECIMAL SEPARATOR
    '٬' => ',', # ARABIC THOUSANDS SEPARATOR
    'ٮ' => 'b', # ARABIC LETTER DOTLESS BEH
    'ٯ' => 'q', # ARABIC LETTER DOTLESS QAF
    'ٰ' => 'a', # ARABIC LETTER SUPERSCRIPT ALEF
    'ٱ' => 'a', # ARABIC LETTER ALEF WASLA
    'ٲ' => 'a', # ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
    'ٳ' => 'a', # ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
    'ٷ' => 'u', # ARABIC LETTER U WITH HAMZA ABOVE
    'ٹ' => 'tt', # ARABIC LETTER TTEH
    'ٺ' => 'tt', # ARABIC LETTER TTEHEH
    'ٻ' => 'b', # ARABIC LETTER BEEH
    'ټ' => 't', # ARABIC LETTER TEH WITH RING
    'ٽ' => 't', # ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
    'پ' => 'p', # ARABIC LETTER PEH
    'ٿ' => 't', # ARABIC LETTER TEHEH
    'ڀ' => 'b', # ARABIC LETTER BEHEH
    'ځ' => 'h', # ARABIC LETTER HAH WITH HAMZA ABOVE
    'ڂ' => 'h', # ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
    'ڃ' => 'ny', # ARABIC LETTER NYEH
    'ڄ' => 'dy', # ARABIC LETTER DYEH
    'څ' => 'h', # ARABIC LETTER HAH WITH THREE DOTS ABOVE
    'چ' => 'tch', # ARABIC LETTER TCHEH
    'ڇ' => 'tch', # ARABIC LETTER TCHEHEH
    'ڈ' => 'dd', # ARABIC LETTER DDAL
    'ډ' => 'd', # ARABIC LETTER DAL WITH RING
    'ڊ' => 'd', # ARABIC LETTER DAL WITH DOT BELOW
    'ڋ' => 'd', # ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
    'ڌ' => 'd', # ARABIC LETTER DAHAL
    'ڍ' => 'dd', # ARABIC LETTER DDAHAL
    'ڎ' => 'd', # ARABIC LETTER DUL
    'ڏ' => 'd', # ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
    'ڐ' => 'd', # ARABIC LETTER DAL WITH FOUR DOTS ABOVE
    'ڑ' => 'rr', # ARABIC LETTER RREH
    'ڒ' => 'r', # ARABIC LETTER REH WITH SMALL V
    'ړ' => 'r', # ARABIC LETTER REH WITH RING
    'ڔ' => 'r', # ARABIC LETTER REH WITH DOT BELOW
    'ڕ' => 'r', # ARABIC LETTER REH WITH SMALL V BELOW
    'ږ' => 'r', # ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
    'ڗ' => 'r', # ARABIC LETTER REH WITH TWO DOTS ABOVE
    'ژ' => 'j', # ARABIC LETTER JEH
    'ڙ' => 'r', # ARABIC LETTER REH WITH FOUR DOTS ABOVE
    'ښ' => 's', # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
    'ڛ' => 's', # ARABIC LETTER SEEN WITH THREE DOTS BELOW
    'ڜ' => 's', # ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
    'ڝ' => 's', # ARABIC LETTER SAD WITH TWO DOTS BELOW
    'ڞ' => 's', # ARABIC LETTER SAD WITH THREE DOTS ABOVE
    'ڟ' => 't', # ARABIC LETTER TAH WITH THREE DOTS ABOVE
    'ڠ' => 'n', # ARABIC LETTER AIN WITH THREE DOTS ABOVE
    'ڡ' => 'f', # ARABIC LETTER DOTLESS FEH
    'ڢ' => 'f', # ARABIC LETTER FEH WITH DOT MOVED BELOW
    'ڣ' => 'f', # ARABIC LETTER FEH WITH DOT BELOW
    'ڤ' => 'v', # ARABIC LETTER VEH
    'ڥ' => 'f', # ARABIC LETTER FEH WITH THREE DOTS BELOW
    'ڦ' => 'p', # ARABIC LETTER PEHEH
    'ڧ' => 'q', # ARABIC LETTER QAF WITH DOT ABOVE
    'ڨ' => 'q', # ARABIC LETTER QAF WITH THREE DOTS ABOVE
    'ک' => 'k', # ARABIC LETTER KEHEH
    'ڪ' => 'k', # ARABIC LETTER SWASH KAF
    'ګ' => 'k', # ARABIC LETTER KAF WITH RING
    'ڬ' => 'k', # ARABIC LETTER KAF WITH DOT ABOVE
    'ڭ' => 'ng', # ARABIC LETTER NG
    'ڮ' => 'k', # ARABIC LETTER KAF WITH THREE DOTS BELOW
    'گ' => 'g', # ARABIC LETTER GAF
    'ڰ' => 'g', # ARABIC LETTER GAF WITH RING
    'ڱ' => 'ng', # ARABIC LETTER NGOEH
    'ڲ' => 'g', # ARABIC LETTER GAF WITH TWO DOTS BELOW
    'ڳ' => 'g', # ARABIC LETTER GUEH
    'ڴ' => 'g', # ARABIC LETTER GAF WITH THREE DOTS ABOVE
    'ڵ' => 'l', # ARABIC LETTER LAM WITH SMALL V
    'ڶ' => 'l', # ARABIC LETTER LAM WITH DOT ABOVE
    'ڷ' => 'l', # ARABIC LETTER LAM WITH THREE DOTS ABOVE
    'ڸ' => 'l', # ARABIC LETTER LAM WITH THREE DOTS BELOW
    'ڹ' => 'n', # ARABIC LETTER NOON WITH DOT BELOW
    'ں' => 'n', # ARABIC LETTER NOON GHUNNA
    'ڻ' => 'rn', # ARABIC LETTER RNOON
    'ڼ' => 'n', # ARABIC LETTER NOON WITH RING
    'ڽ' => 'n', # ARABIC LETTER NOON WITH THREE DOTS ABOVE
    'ھ' => 'h', # ARABIC LETTER HEH DOACHASHMEE
    'ڿ' => 'tch', # ARABIC LETTER TCHEH WITH DOT ABOVE
    'ۀ' => 'h', # ARABIC LETTER HEH WITH YEH ABOVE
    'ہ' => 'h', # ARABIC LETTER HEH GOAL
    'ۂ' => 'h', # ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
    'ۃ' => 'a', # ARABIC LETTER TEH MARBUTA GOAL
    'ۄ' => 'w', # ARABIC LETTER WAW WITH RING
    'ۅ' => 'oe', # ARABIC LETTER KIRGHIZ OE
    'ۆ' => 'oe', # ARABIC LETTER OE
    'ۇ' => 'u', # ARABIC LETTER U
    'ۈ' => 'yu', # ARABIC LETTER YU
    'ۉ' => 'yu', # ARABIC LETTER KIRGHIZ YU
    'ۊ' => 'w', # ARABIC LETTER WAW WITH TWO DOTS ABOVE
    'ۋ' => 'v', # ARABIC LETTER VE
    'ی' => 'y', # ARABIC LETTER FARSI YEH
    'ۍ' => 'y', # ARABIC LETTER YEH WITH TAIL
    'ێ' => 'y', # ARABIC LETTER YEH WITH SMALL V
    'ۏ' => 'w', # ARABIC LETTER WAW WITH DOT ABOVE
    'ې' => 'e', # ARABIC LETTER E
    'ۑ' => 'y', # ARABIC LETTER YEH WITH THREE DOTS BELOW
    'ے' => 'y', # ARABIC LETTER YEH BARREE
    'ۓ' => 'y', # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
    '۔' => '.', # ARABIC FULL STOP
    'ە' => 'ae', # ARABIC LETTER AE
    'ۮ' => 'd', # ARABIC LETTER DAL WITH INVERTED V
    'ۯ' => 'r', # ARABIC LETTER REH WITH INVERTED V
    '۰' => '0', # EXTENDED ARABIC-INDIC DIGIT ZERO
    '۱' => '1', # EXTENDED ARABIC-INDIC DIGIT ONE
    '۲' => '2', # EXTENDED ARABIC-INDIC DIGIT TWO
    '۳' => '3', # EXTENDED ARABIC-INDIC DIGIT THREE
    '۴' => '4', # EXTENDED ARABIC-INDIC DIGIT FOUR
    '۵' => '5', # EXTENDED ARABIC-INDIC DIGIT FIVE
    '۶' => '6', # EXTENDED ARABIC-INDIC DIGIT SIX
    '۷' => '7', # EXTENDED ARABIC-INDIC DIGIT SEVEN
    '۸' => '8', # EXTENDED ARABIC-INDIC DIGIT EIGHT
    '۹' => '9', # EXTENDED ARABIC-INDIC DIGIT NINE
    'ۺ' => 'sh', # ARABIC LETTER SHEEN WITH DOT BELOW
    'ۻ' => 'd', # ARABIC LETTER DAD WITH DOT BELOW
    'ۼ' => 'gh', # ARABIC LETTER GHAIN WITH DOT BELOW
    '۽' => '&', # ARABIC SIGN SINDHI AMPERSAND
    'ﷲ' => 'Allah', # ARABIC LIGATURE ALLAH ISOLATED FORM
    'ۡ' => '', # small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
    'اً' => 'an', # alef + fathatan
    'ً' => '', # fathatan ("-an")
    'ٌ' => '', # dammatan ("-un")
    'ٍ' => '', # kasratan ("-in")
    'ّ' => '', # shadda (consonant doubler)
    '۾' => 'men', # Sindhi postposition men
    'ؑ' => 'alayhe wasallam', # "upon him be peace"
    'ﷴ' => 'Mohammad', # "Mohammad"
    'ﷸ' => 'wasallam', # "and peace"
    'ﷺ' => 'sallallahou alayhe wasallam', # "prayer of God be upon him and his family and peace"
  }.freeze

  # Matches any key of ARABIC. Keys are tried longest first so the
  # two-character "alef + fathatan" sequence is recognised before the single
  # alef it starts with; replacing in table order would consume the alef
  # first and the sequence could never match.
  ARABIC_PATTERN = Regexp.union(ARABIC.keys.sort_by { |key| -key.length }).freeze

  # Transliterates every character listed in ARABIC; all other characters
  # are passed through unchanged.
  #
  # @param text [String] text containing Arabic-script characters
  # @return [String] a new string; +text+ itself is not modified
  def self.t(text)
    # Single pass over the input: each match is looked up in the table.
    text.gsub(ARABIC_PATTERN, ARABIC)
  end

end
@@ -0,0 +1,18 @@
1
# Romanizes Han (Chinese) characters to pinyin, one character at a time.
class Han
  require 'chinese_pinyin'

  # Converts every Han character of +text+ to pinyin; all other characters
  # are copied through unchanged.
  #
  # @param text [String] text to romanize
  # @param opt [Symbol, nil] pass +:ascii+ to call Pinyin.t with empty
  #   options; any other value requests tone marks
  # @return [String] the romanized text, with a single space inserted in
  #   front of each converted character
  def self.t(text, opt = nil)
    pieces = text.chars.flat_map do |glyph|
      next [glyph] unless glyph =~ /\p{Han}/

      # A fresh options hash is built for every call, as Pinyin.t receives it
      # directly.
      pinyin_options = opt == :ascii ? {} : {tonemarks: true}
      [' ', Pinyin.t(glyph, pinyin_options)]
    end

    # NOTE(review): as written this replaces a space with an identical space,
    # i.e. it changes nothing. Presumably the pattern was meant to collapse
    # the doubled spaces produced above -- confirm against the gem source.
    pieces.join('').gsub(' ', ' ')
  end
end
@@ -0,0 +1,61 @@
1
# Romanizes Japanese text: MeCab supplies a reading for each token, the
# Romaji gem converts that reading to Latin letters, and Japanese
# punctuation is then mapped to Latin punctuation.
class Japanese
  require 'mecab_standalone'
  require 'romaji'

  # Japanese punctuation => Latin replacement, applied after romanization.
  # Opening brackets are keyed with a trailing space and mapped to a leading
  # space: `t` prefixes every romanized token with a space, so the space
  # ends up in front of the bracket instead of behind it.
  #
  # NOTE(review): several entries below map a character to itself (' ', ':',
  # '!', '?'). Presumably the keys were full-width forms that were lost in
  # transcription -- confirm against the gem source before changing them.
  JAPANESE_PUNCTUATION = {
    ' ' => ' ',
    '、' => ',',
    '。' => '.',
    ':' => ':',
    '!' => '!',
    '?' => '?',
    '〜' => '~',
    '…' => '...',
    '‥' => '..',
    '「 ' => ' \'',
    '」' => '\'',
    '『 ' => ' "',
    '』' => '"',
    '〝 ' => ' "',
    '〟' => '"',
    '( ' => ' (',
    ')' => ')',
    '【 ' => ' [',
    '】' => ']',
    '{ ' => ' {',
    '}' => '}',
  }.freeze

  # Original (misspelled) constant name, kept so existing references work.
  JAPANESE_PONCTUATION = JAPANESE_PUNCTUATION

  # Part-of-speech tag MeCab assigns to auxiliary verbs.
  AUXILIARY_VERB = "助動詞"

  # Romanizes +text+.
  #
  # @param text [String] Japanese text
  # @return [String] a copy of +text+ in which every token with a katakana
  #   reading is replaced by a space followed by its romaji, and punctuation
  #   is replaced according to JAPANESE_PUNCTUATION
  def self.t(text)
    latin = text.dup
    parse(text).each do |surface, _part_of_speech, reading|
      # Tokens without a katakana reading (symbols, Latin text, unknown
      # words) are left as they are.
      next unless /\p{Katakana}/.match?(reading)

      # Block form: the replacement is inserted literally, so a backslash in
      # the romaji is never interpreted as a back-reference.
      latin.sub!(surface) { ' ' + Romaji.kana2romaji(reading) }
    end
    JAPANESE_PUNCTUATION.each { |from, to| latin.gsub!(from, to) }
    latin
  end

  # Tokenizes +text+ with MeCab and merges auxiliary verbs into the token
  # that precedes them.
  #
  # @param text [String] Japanese text
  # @return [Array<Array<String>>] one [surface, part_of_speech, reading]
  #   triple per token; the surface is used as the reading when MeCab
  #   reports '*'
  def self.parse(text)
    tokens = MecabStandalone.parse(text)
                            .split("\n")
                            .map { |line| line.split("\t") }
                            .tap(&:pop) # drop the last output line (presumably MeCab's "EOS" marker)
                            .map do |surface, features|
      fields = [surface, *features.split(',')]
      # First feature is the part of speech; the second-to-last field is
      # taken as the reading.
      [fields[0], fields[1], fields[-2]]
    end

    tokens.each_with_object([]) do |(surface, part_of_speech, reading), merged|
      # '*' means MeCab has no reading for this token: fall back to the
      # surface form, also when the token is merged into its predecessor.
      reading = surface if reading == '*'

      if part_of_speech == AUXILIARY_VERB && merged.any?
        merged.last[0] += surface
        merged.last[-1] += reading
      else
        # Also reached by an auxiliary verb at the very start of the text,
        # which has no predecessor to merge into.
        merged << [surface, part_of_speech, reading]
      end
    end
  end

end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: latinizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Yugue
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-09 00:00:00.000000000 Z
11
+ date: 2020-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chinese_pinyin
@@ -108,6 +108,9 @@ extensions: []
108
108
  extra_rdoc_files: []
109
109
  files:
110
110
  - lib/latinizer.rb
111
+ - lib/lib/arabic.rb
112
+ - lib/lib/han.rb
113
+ - lib/lib/japanese.rb
111
114
  homepage: https://github.com/wyugue/latinizer
112
115
  licenses:
113
116
  - MIT