latinizer 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 79227b3ecff306b41fc210a377ddb1811a8cdfc2823390bf7ea3a59b6a91c7fc
4
- data.tar.gz: 32a70e923290a31c5b8fdc1eeda516c2390333445adcacfac712e8fcd5f2e9ec
3
+ metadata.gz: 21b15ac5fbb50b85ac397a6b1e58bb4e7e63c5ce9800d7c21eddd02a5fa09d7d
4
+ data.tar.gz: 36e10b84402711854a0345eaf5ccf488a125ce197690daabdff6443526d20522
5
5
  SHA512:
6
- metadata.gz: b513f27ef4716f3827aad163ea9ba7c260cf2b64ed20ff455624fef52c174f10f5e763f85b6f62ccf31adf64382aedd1d56d9e0443ac9cf411bb78620411ab53
7
- data.tar.gz: 0efcd64fa9ce46b45efdbdcdbd3150e478add0d42566279461f90f145068f201efed0ad4adb43fadddc3cf1327a0b0e8c2d9eea8a35ea2941e8791b27699a8a0
6
+ metadata.gz: 4c1b6a8eddc5258615cf0017b8209e6ee15ecb4332e0f241457e7a403364597e4ee525a89bcec3a93eb2e89821498ac65cbae893183b54cffffa7e88d3431672
7
+ data.tar.gz: 3a56c28ccbb182a25ba8e65998d0c95038f613ad4cbdd42d81103e47dc26bcacd0f7eee80549650736d9bb5657e9dd1de9bd824709a1ab9602ca9c77ff9417f1
@@ -1,293 +1,70 @@
# Converts text written in a supported non-Latin script to Latin characters.
#
# Script detection is delegated to Unicode::Scripts; the per-script work is
# done by Arabic, Han and Japanese (loaded from ./lib) and by Translit for
# Cyrillic.
class Latinizer
  require 'translit'
  require 'unicode/scripts'
  require 'babosa'
  require_relative './lib/arabic.rb'
  require_relative './lib/han.rb'
  require_relative './lib/japanese.rb'

  # Script labels latinize_script can convert. 'Japanese' is the label
  # detect_non_latin_scripts substitutes when kana is present.
  SUPPORTED_SCRIPTS = [
    'Arabic',
    'Cyrillic',
    'Han',
    'Japanese'
  ].freeze

  # Script labels that are ignored when looking for text to convert.
  LATIN_COMPATIBLE_SCRIPTS = ['Common', 'Inherited', 'Latin'].freeze

  # Latinizes +text+.
  #
  # NOTE(review): 0.2.0 special-cased opt == :ja (forced Japanese
  # romanization); that value is ignored here — confirm the removal is
  # intentional.
  #
  # @param text [String] text to convert
  # @param opt [Symbol, nil] pass :ascii to strip the result down to ASCII
  # @return [String] the converted text; returned unchanged when no supported
  #   script is detected and opt is not :ascii
  def self.t(text, opt = nil)
    latinized = text
    attempted = []

    loop do
      scripts = detect_non_latin_scripts(latinized)
      break if scripts.empty?

      # A single remaining script is converted once and we are done.
      if scripts.size == 1
        latinized = latinize_script(latinized, scripts.first, opt)
        break
      end

      # Several scripts: convert them one at a time, re-detecting after each
      # pass. A script is never retried, so characters a converter cannot
      # handle no longer cause endless recursion.
      script = (scripts - attempted).first
      break if script.nil?

      attempted << script
      latinized = latinize_script(latinized, script, opt)
    end

    opt == :ascii ? remove_non_ascii(latinized) : latinized
  end

  # Converts the characters of one script inside +text+.
  #
  # @param text [String] text to convert
  # @param script [String] one of SUPPORTED_SCRIPTS
  # @param opt [Symbol, nil] :ascii removes diacritics (Cyrillic) and is
  #   forwarded to Han.t
  # @return [String] converted text, or +text+ itself for an unknown script
  def self.latinize_script(text, script, opt = nil)
    case script
    when 'Arabic'
      Arabic.t(text)
    when 'Cyrillic'
      latinized = Translit.convert(text, :english)
      opt == :ascii ? remove_diacritics(latinized) : latinized
    when 'Han'
      Han.t(text, opt)
    when 'Japanese'
      Japanese.t(text)
    else
      text
    end
  end

  # Lists the supported scripts found in +text+.
  #
  # When kana is present, 'Han', 'Hiragana' and 'Katakana' are replaced by the
  # single label 'Japanese'.
  #
  # @param text [String]
  # @return [Array<String>] subset of SUPPORTED_SCRIPTS
  def self.detect_non_latin_scripts(text)
    scripts = Unicode::Scripts.scripts(text) - LATIN_COMPATIBLE_SCRIPTS
    if is_japanese?(scripts)
      scripts -= ['Han', 'Hiragana', 'Katakana']
      scripts += ['Japanese']
    end
    # Array#& instead of Array#intersection, which needs Ruby 2.7+.
    scripts & SUPPORTED_SCRIPTS
  end

  # Runs +text+ through babosa's transliterate step.
  #
  # @param text [String]
  # @return [String]
  def self.remove_diacritics(text)
    text.to_slug.transliterate.to_s
  end

  # Runs +text+ through babosa's transliterate and to_ascii steps.
  #
  # @param text [String]
  # @return [String]
  def self.remove_non_ascii(text)
    text.to_slug.transliterate.to_ascii.to_s
  end

  # Tells whether +text+ contains any script other than Common, Inherited or
  # Latin (supported or not).
  #
  # @param text [String]
  # @return [Boolean]
  def self.has_non_latin?(text)
    scripts = Unicode::Scripts.scripts(text) - LATIN_COMPATIBLE_SCRIPTS
    !scripts.empty?
  end

  # Tells whether a list of script names contains kana.
  #
  # @param scripts [Array<String>] script names
  # @return [Boolean]
  def self.is_japanese?(scripts)
    scripts.include?('Hiragana') || scripts.include?('Katakana')
  end
end
@@ -0,0 +1,199 @@
# Character-by-character romanization of Arabic-script text.
class Arabic

  # Arabic-script character (or short sequence) => Latin replacement.
  ARABIC = {
    '،' => ',', # ARABIC COMMA
    '؛' => ';', # ARABIC SEMICOLON
    '؟' => '?', # ARABIC QUESTION MARK
    'ء' => "'", # ARABIC LETTER HAMZA
    'آ' => 'a', # ARABIC LETTER ALEF WITH MADDA ABOVE
    'أ' => 'a', # ARABIC LETTER ALEF WITH HAMZA ABOVE
    'ؤ' => 'w', # ARABIC LETTER WAW WITH HAMZA ABOVE
    'إ' => 'i', # ARABIC LETTER ALEF WITH HAMZA BELOW
    'ئ' => 'ye', # ARABIC LETTER YEH WITH HAMZA ABOVE
    'ا' => 'a', # ARABIC LETTER ALEF
    'ب' => 'b', # ARABIC LETTER BEH
    'ة' => 'a', # ARABIC LETTER TEH MARBUTA
    'ت' => 't', # ARABIC LETTER TEH
    'ث' => 'th', # ARABIC LETTER THEH
    'ج' => 'j', # ARABIC LETTER JEEM
    'ح' => 'h', # ARABIC LETTER HAH
    'خ' => 'kh', # ARABIC LETTER KHAH
    'د' => 'd', # ARABIC LETTER DAL
    'ذ' => 'th', # ARABIC LETTER THAL
    'ر' => 'r', # ARABIC LETTER REH
    'ز' => 'z', # ARABIC LETTER ZAIN
    'س' => 's', # ARABIC LETTER SEEN
    'ش' => 'sh', # ARABIC LETTER SHEEN
    'ص' => 's', # ARABIC LETTER SAD
    'ض' => 'd', # ARABIC LETTER DAD
    'ط' => 't', # ARABIC LETTER TAH
    'ظ' => 'z', # ARABIC LETTER ZAH
    'ع' => "'", # ARABIC LETTER AIN
    'غ' => 'gh', # ARABIC LETTER GHAIN
    'ـ' => '-', # ARABIC TATWEEL
    'ف' => 'f', # ARABIC LETTER FEH
    'ق' => 'q', # ARABIC LETTER QAF
    'ك' => 'k', # ARABIC LETTER KAF
    'ل' => 'l', # ARABIC LETTER LAM
    'م' => 'm', # ARABIC LETTER MEEM
    'ن' => 'n', # ARABIC LETTER NOON
    'ه' => 'h', # ARABIC LETTER HEH
    'و' => 'w', # ARABIC LETTER WAW
    'ى' => 'a', # ARABIC LETTER ALEF MAKSURA
    'ي' => 'y', # ARABIC LETTER YEH
    'َ' => 'a', # ARABIC FATHA
    'ُ' => 'u', # ARABIC DAMMA
    'ِ' => 'i', # ARABIC KASRA
    'ْ' => '', # ARABIC SUKUN
    'ٔ' => "'", # ARABIC HAMZA ABOVE
    'ٕ' => "'", # ARABIC HAMZA BELOW
    '٠' => '0', # ARABIC-INDIC DIGIT ZERO
    '١' => '1', # ARABIC-INDIC DIGIT ONE
    '٢' => '2', # ARABIC-INDIC DIGIT TWO
    '٣' => '3', # ARABIC-INDIC DIGIT THREE
    '٤' => '4', # ARABIC-INDIC DIGIT FOUR
    '٥' => '5', # ARABIC-INDIC DIGIT FIVE
    '٦' => '6', # ARABIC-INDIC DIGIT SIX
    '٧' => '7', # ARABIC-INDIC DIGIT SEVEN
    '٨' => '8', # ARABIC-INDIC DIGIT EIGHT
    '٩' => '9', # ARABIC-INDIC DIGIT NINE
    '٪' => '%', # ARABIC PERCENT SIGN
    '٫' => ',', # ARABIC DECIMAL SEPARATOR
    '٬' => ',', # ARABIC THOUSANDS SEPARATOR
    'ٮ' => 'b', # ARABIC LETTER DOTLESS BEH
    'ٯ' => 'q', # ARABIC LETTER DOTLESS QAF
    'ٰ' => 'a', # ARABIC LETTER SUPERSCRIPT ALEF
    'ٱ' => 'a', # ARABIC LETTER ALEF WASLA
    'ٲ' => 'a', # ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
    'ٳ' => 'a', # ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
    'ٷ' => 'u', # ARABIC LETTER U WITH HAMZA ABOVE
    'ٹ' => 'tt', # ARABIC LETTER TTEH
    'ٺ' => 'tt', # ARABIC LETTER TTEHEH
    'ٻ' => 'b', # ARABIC LETTER BEEH
    'ټ' => 't', # ARABIC LETTER TEH WITH RING
    'ٽ' => 't', # ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
    'پ' => 'p', # ARABIC LETTER PEH
    'ٿ' => 't', # ARABIC LETTER TEHEH
    'ڀ' => 'b', # ARABIC LETTER BEHEH
    'ځ' => 'h', # ARABIC LETTER HAH WITH HAMZA ABOVE
    'ڂ' => 'h', # ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
    'ڃ' => 'ny', # ARABIC LETTER NYEH
    'ڄ' => 'dy', # ARABIC LETTER DYEH
    'څ' => 'h', # ARABIC LETTER HAH WITH THREE DOTS ABOVE
    'چ' => 'tch', # ARABIC LETTER TCHEH
    'ڇ' => 'tch', # ARABIC LETTER TCHEHEH
    'ڈ' => 'dd', # ARABIC LETTER DDAL
    'ډ' => 'd', # ARABIC LETTER DAL WITH RING
    'ڊ' => 'd', # ARABIC LETTER DAL WITH DOT BELOW
    'ڋ' => 'd', # ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
    'ڌ' => 'd', # ARABIC LETTER DAHAL
    'ڍ' => 'dd', # ARABIC LETTER DDAHAL
    'ڎ' => 'd', # ARABIC LETTER DUL
    'ڏ' => 'd', # ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
    'ڐ' => 'd', # ARABIC LETTER DAL WITH FOUR DOTS ABOVE
    'ڑ' => 'rr', # ARABIC LETTER RREH
    'ڒ' => 'r', # ARABIC LETTER REH WITH SMALL V
    'ړ' => 'r', # ARABIC LETTER REH WITH RING
    'ڔ' => 'r', # ARABIC LETTER REH WITH DOT BELOW
    'ڕ' => 'r', # ARABIC LETTER REH WITH SMALL V BELOW
    'ږ' => 'r', # ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
    'ڗ' => 'r', # ARABIC LETTER REH WITH TWO DOTS ABOVE
    'ژ' => 'j', # ARABIC LETTER JEH
    'ڙ' => 'r', # ARABIC LETTER REH WITH FOUR DOTS ABOVE
    'ښ' => 's', # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
    'ڛ' => 's', # ARABIC LETTER SEEN WITH THREE DOTS BELOW
    'ڜ' => 's', # ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
    'ڝ' => 's', # ARABIC LETTER SAD WITH TWO DOTS BELOW
    'ڞ' => 's', # ARABIC LETTER SAD WITH THREE DOTS ABOVE
    'ڟ' => 't', # ARABIC LETTER TAH WITH THREE DOTS ABOVE
    'ڠ' => 'n', # ARABIC LETTER AIN WITH THREE DOTS ABOVE
    'ڡ' => 'f', # ARABIC LETTER DOTLESS FEH
    'ڢ' => 'f', # ARABIC LETTER FEH WITH DOT MOVED BELOW
    'ڣ' => 'f', # ARABIC LETTER FEH WITH DOT BELOW
    'ڤ' => 'v', # ARABIC LETTER VEH
    'ڥ' => 'f', # ARABIC LETTER FEH WITH THREE DOTS BELOW
    'ڦ' => 'p', # ARABIC LETTER PEHEH
    'ڧ' => 'q', # ARABIC LETTER QAF WITH DOT ABOVE
    'ڨ' => 'q', # ARABIC LETTER QAF WITH THREE DOTS ABOVE
    'ک' => 'k', # ARABIC LETTER KEHEH
    'ڪ' => 'k', # ARABIC LETTER SWASH KAF
    'ګ' => 'k', # ARABIC LETTER KAF WITH RING
    'ڬ' => 'k', # ARABIC LETTER KAF WITH DOT ABOVE
    'ڭ' => 'ng', # ARABIC LETTER NG
    'ڮ' => 'k', # ARABIC LETTER KAF WITH THREE DOTS BELOW
    'گ' => 'g', # ARABIC LETTER GAF
    'ڰ' => 'g', # ARABIC LETTER GAF WITH RING
    'ڱ' => 'ng', # ARABIC LETTER NGOEH
    'ڲ' => 'g', # ARABIC LETTER GAF WITH TWO DOTS BELOW
    'ڳ' => 'g', # ARABIC LETTER GUEH
    'ڴ' => 'g', # ARABIC LETTER GAF WITH THREE DOTS ABOVE
    'ڵ' => 'l', # ARABIC LETTER LAM WITH SMALL V
    'ڶ' => 'l', # ARABIC LETTER LAM WITH DOT ABOVE
    'ڷ' => 'l', # ARABIC LETTER LAM WITH THREE DOTS ABOVE
    'ڸ' => 'l', # ARABIC LETTER LAM WITH THREE DOTS BELOW
    'ڹ' => 'n', # ARABIC LETTER NOON WITH DOT BELOW
    'ں' => 'n', # ARABIC LETTER NOON GHUNNA
    'ڻ' => 'rn', # ARABIC LETTER RNOON
    'ڼ' => 'n', # ARABIC LETTER NOON WITH RING
    'ڽ' => 'n', # ARABIC LETTER NOON WITH THREE DOTS ABOVE
    'ھ' => 'h', # ARABIC LETTER HEH DOACHASHMEE
    'ڿ' => 'tch', # ARABIC LETTER TCHEH WITH DOT ABOVE
    'ۀ' => 'h', # ARABIC LETTER HEH WITH YEH ABOVE
    'ہ' => 'h', # ARABIC LETTER HEH GOAL
    'ۂ' => 'h', # ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
    'ۃ' => 'a', # ARABIC LETTER TEH MARBUTA GOAL
    'ۄ' => 'w', # ARABIC LETTER WAW WITH RING
    'ۅ' => 'oe', # ARABIC LETTER KIRGHIZ OE
    'ۆ' => 'oe', # ARABIC LETTER OE
    'ۇ' => 'u', # ARABIC LETTER U
    'ۈ' => 'yu', # ARABIC LETTER YU
    'ۉ' => 'yu', # ARABIC LETTER KIRGHIZ YU
    'ۊ' => 'w', # ARABIC LETTER WAW WITH TWO DOTS ABOVE
    'ۋ' => 'v', # ARABIC LETTER VE
    'ی' => 'y', # ARABIC LETTER FARSI YEH
    'ۍ' => 'y', # ARABIC LETTER YEH WITH TAIL
    'ێ' => 'y', # ARABIC LETTER YEH WITH SMALL V
    'ۏ' => 'w', # ARABIC LETTER WAW WITH DOT ABOVE
    'ې' => 'e', # ARABIC LETTER E
    'ۑ' => 'y', # ARABIC LETTER YEH WITH THREE DOTS BELOW
    'ے' => 'y', # ARABIC LETTER YEH BARREE
    'ۓ' => 'y', # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
    '۔' => '.', # ARABIC FULL STOP
    'ە' => 'ae', # ARABIC LETTER AE
    'ۮ' => 'd', # ARABIC LETTER DAL WITH INVERTED V
    'ۯ' => 'r', # ARABIC LETTER REH WITH INVERTED V
    '۰' => '0', # EXTENDED ARABIC-INDIC DIGIT ZERO
    '۱' => '1', # EXTENDED ARABIC-INDIC DIGIT ONE
    '۲' => '2', # EXTENDED ARABIC-INDIC DIGIT TWO
    '۳' => '3', # EXTENDED ARABIC-INDIC DIGIT THREE
    '۴' => '4', # EXTENDED ARABIC-INDIC DIGIT FOUR
    '۵' => '5', # EXTENDED ARABIC-INDIC DIGIT FIVE
    '۶' => '6', # EXTENDED ARABIC-INDIC DIGIT SIX
    '۷' => '7', # EXTENDED ARABIC-INDIC DIGIT SEVEN
    '۸' => '8', # EXTENDED ARABIC-INDIC DIGIT EIGHT
    '۹' => '9', # EXTENDED ARABIC-INDIC DIGIT NINE
    'ۺ' => 'sh', # ARABIC LETTER SHEEN WITH DOT BELOW
    'ۻ' => 'd', # ARABIC LETTER DAD WITH DOT BELOW
    'ۼ' => 'gh', # ARABIC LETTER GHAIN WITH DOT BELOW
    '۽' => '&', # ARABIC SIGN SINDHI AMPERSAND
    'ﷲ' => 'Allah', # ARABIC LIGATURE ALLAH ISOLATED FORM
    'ۡ' => '', # small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
    'اً' => 'an', # alef + fathatan
    'ً' => '', # fathatan ("-an")
    'ٌ' => '', # dammatan ("-un")
    'ٍ' => '', # kasratan ("-in")
    'ّ' => '', # shadda (consonant doubler)
    '۾' => 'men', # Sindhi postposition men
    'ؑ' => 'alayhe wasallam', # "upon him be peace"
    'ﷴ' => 'Mohammad', # "Mohammad"
    'ﷸ' => 'wasallam', # "and peace"
    'ﷺ' => 'sallallahou alayhe wasallam', # "prayer of God be upon him and his family and peace"
    # Presentation forms handled by the 0.2.0 gsub chain.
    'ﻯ' => 'a', # alef maksura isolated form
    'ﻰ' => 'a', # alef maksura final form
    'ﯨ' => 'a', # Uighur Kazach Kirghiz alef maksura initial form
    'ﯩ' => 'a', # Uighur Kazach Kirghiz alef maksura medial form
  }.freeze

  # One alternation over every key, longest key first, so a multi-character
  # key (alef + fathatan) is matched before the single characters it is made of.
  ARABIC_PATTERN = Regexp.union(ARABIC.keys.sort_by { |key| -key.length }).freeze

  # Replaces every character listed in ARABIC with its Latin counterpart.
  # Characters not in the table are left as they are.
  #
  # @param text [String] text to convert; it is not modified
  # @return [String] a new string with the replacements applied
  def self.t(text)
    # Single pass: each match is looked up in the table.
    text.gsub(ARABIC_PATTERN, ARABIC)
  end

end
@@ -0,0 +1,18 @@
# Romanizes Han characters to pinyin, one character at a time.
class Han
  require 'chinese_pinyin'

  # Replaces each Han character in +text+ with its pinyin, preceded by a
  # space. Every other character is kept as it is.
  #
  # @param text [String] text to convert
  # @param opt [Symbol, nil] :ascii asks Pinyin for output without tone marks
  # @return [String] converted text (starts with a space when the first
  #   character is Han)
  def self.t(text, opt = nil)
    # Built once instead of once per character.
    pinyin_options = opt == :ascii ? {} : { tonemarks: true }

    pieces = text.each_char.map do |char|
      char.match?(/\p{Han}/) ? " #{Pinyin.t(char, pinyin_options)}" : char
    end

    # Collapse the doubled space left when a Han character follows a space.
    # NOTE(review): the rendered source showed a single-space pattern here,
    # which replaces nothing — presumably whitespace lost in rendering; confirm.
    pieces.join.gsub('  ', ' ')
  end
end
@@ -0,0 +1,61 @@
# Romanizes Japanese text using MeCab tokenization and Romaji conversion.
class Japanese
  require 'mecab_standalone'
  require 'romaji'

  # Punctuation replacements applied after romanization. Keys with a trailing
  # space match an opening bracket followed by the space that t inserts in
  # front of each romanized token.
  # NOTE(review): several entries map a character to itself as rendered
  # (' ', ':', '!', '?', ')', '}') — presumably full-width forms that were
  # normalized somewhere upstream; verify against the repository.
  JAPANESE_PUNCTUATION = {
    ' ' => ' ',
    '、' => ',',
    '。' => '.',
    ':' => ':',
    '!' => '!',
    '?' => '?',
    '〜' => '~',
    '…' => '...',
    '‥' => '..',
    '「 ' => ' \'',
    '」' => '\'',
    '『 ' => ' "',
    '』' => '"',
    '〝 ' => ' "',
    '〟' => '"',
    '( ' => ' (',
    ')' => ')',
    '【 ' => ' [',
    '】' => ']',
    '{ ' => ' {',
    '}' => '}',
  }.freeze

  # Original (misspelled) constant name, kept so existing references still work.
  JAPANESE_PONCTUATION = JAPANESE_PUNCTUATION

  # Feature value that marks a token to be merged into the previous one.
  AUXILIARY_VERB = '助動詞'

  # Romanizes +text+.
  #
  # Each parsed token whose last field contains katakana has the first
  # occurrence of its surface form replaced by a space plus the romaji of
  # that field; punctuation is mapped afterwards.
  #
  # @param text [String] text to convert; it is not modified
  # @return [String] converted text
  def self.t(text)
    latin = text.dup

    parse(text).each do |token|
      # Regexp#match? is false for nil, like the =~ test it replaces.
      next unless /\p{Katakana}/.match?(token[-1])

      latin.sub!(token[0], " #{Romaji.kana2romaji(token[-1])}")
    end

    JAPANESE_PUNCTUATION.each { |from, to| latin.gsub!(from, to) }
    latin
  end

  # Tokenizes +text+ with MecabStandalone.
  #
  # Each result is [surface, first feature, second-to-last field] —
  # presumably the part of speech and the katakana reading in MeCab's IPADIC
  # layout; TODO confirm. A token whose last field is '*' gets its surface
  # as last field instead.
  #
  # @param text [String]
  # @return [Array<Array<String>>] tokens, auxiliary verbs merged into the
  #   token before them
  def self.parse(text)
    lines = MecabStandalone.parse(text).split("\n")
    lines.pop # the last output line is dropped, as before

    tokens = []
    lines.each do |line|
      surface, features = line.split("\t")
      # A line without a feature column used to raise NoMethodError.
      next if features.nil?

      fields = [surface].concat(features.split(','))
      tokens << [fields[0], fields[1], fields[-2]]
    end

    tokens.each_with_object([]) do |token, merged|
      if token[1] == AUXILIARY_VERB && !merged.empty?
        # Glue the auxiliary verb onto the preceding token.
        merged.last[0] += token[0]
        merged.last[-1] += token[-1]
      elsif token[-1] == '*'
        merged << [token[0], token[1], token[0]]
      else
        # Also reached by an auxiliary verb with nothing before it, which
        # used to raise NoMethodError on the empty result list.
        merged << token
      end
    end
  end

end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: latinizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Yugue
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-09 00:00:00.000000000 Z
11
+ date: 2020-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chinese_pinyin
@@ -108,6 +108,9 @@ extensions: []
108
108
  extra_rdoc_files: []
109
109
  files:
110
110
  - lib/latinizer.rb
111
+ - lib/lib/arabic.rb
112
+ - lib/lib/han.rb
113
+ - lib/lib/japanese.rb
111
114
  homepage: https://github.com/wyugue/latinizer
112
115
  licenses:
113
116
  - MIT