tamil 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/tamil.rb +519 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cd7f55b1244ad7f539d0d0211027171f2ea7acdf
|
4
|
+
data.tar.gz: 198b5958f12c7e363b3d86ed736ab82a4f496fb4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c1d8bd9c3df20586ffb1f4e56a36e0bb949fe961d365781e770d371c1bd56957b0a208417c457d8f0a20c34278b994790ee6e7e6db570166344bab3b4cd8055b
|
7
|
+
data.tar.gz: 9abf039aadfdc6b8a95cc7646ee433ead2ac54d8e252c6df8283e612e7c15431b5bb810a06a4274028e378a2d3527b56531ad6cb5a6aacab47ae5bf0a97554f5
|
data/lib/tamil.rb
ADDED
@@ -0,0 +1,519 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# (C) 2015 Muthiah Annamalai <ezhillang@gmail.com>
|
3
|
+
|
4
|
+
# Error raised by the top-level +assert+ helper when the checked block
# evaluates to a falsy value. Being a RuntimeError it is caught by a bare
# +rescue+.
class AssertionError < RuntimeError; end
|
6
|
+
|
7
|
+
# Minimal assertion helper used by the Tamil accessors for bounds checks.
#
# Yields to the given block and raises AssertionError when the block
# returns a falsy value (+nil+ or +false+). Returns +nil+ otherwise.
#
# @yieldreturn [Object] the condition being asserted
# @raise [ArgumentError] if no block is supplied
# @raise [AssertionError] if the block evaluates to +nil+ or +false+
def assert
  # Without this guard a block-less call would surface as a LocalJumpError
  # from +yield+, which hides the real mistake at the call site.
  raise ArgumentError, 'assert requires a block' unless block_given?

  raise AssertionError unless yield
end
|
10
|
+
|
11
|
+
# Letter tables and small helpers for working with Tamil text.
#
# The module exposes the sizes of the individual letter classes, indexed
# accessors into each table, and Tamil.get_letters, which splits a string into
# user-perceived letters (base letter plus any trailing vowel sign / pulli).
#
# The indexed accessors rely on the top-level +assert+ helper defined in this
# file for their bounds checks.
module Tamil
  ## constants
  TA_ACCENT_LEN = 13 # 12 vowel signs (the first being empty) + the ayudha sign
  TA_AYUDHA_LEN = 1
  TA_UYIR_LEN = 12
  TA_MEI_LEN = 18
  TA_AGARAM_LEN = 18
  TA_SANSKRIT_LEN = 6
  TA_UYIRMEI_LEN = 216
  TA_GRANTHA_UYIRMEI_LEN = 24*12
  TA_LETTERS_LEN = 247 + 6*12 + 22 + 4 - TA_AGARAM_LEN - 4 #323

  # List of letters you can use

  # Bare consonant bases (agaram): each consonant without any vowel sign.
  @@agaram_letters = ["க","ச","ட","த","ப","ற","ஞ","ங","ண","ந","ம","ன","ய","ர","ல","வ","ழ","ள"]
  AGARAM_LETTERS = @@agaram_letters.clone

  # Independent vowels (uyir) and the ayudha letter.
  @@uyir_letters = ["அ","ஆ","இ","ஈ","உ","ஊ","எ","ஏ","ஐ","ஒ","ஓ","ஔ"]
  @@ayudha_letter = "ஃ"

  # Short (kuril) and long (nedil) vowels.
  @@kuril_letters = ["அ", "இ", "உ", "எ", "ஒ"]
  @@nedil_letters = ["ஆ", "ஈ", "ஊ", "ஏ", "ஓ"]

  # Consonant classes, each written with the pulli.
  @@vallinam_letters = ["க்", "ச்", "ட்", "த்", "ப்", "ற்"]
  @@mellinam_letters = ["ங்", "ஞ்", "ண்", "ந்", "ம்", "ன்"]
  @@idayinam_letters = ["ய்", "ர்", "ல்", "வ்", "ழ்", "ள்"]

  # Pure consonants (mei), in the same order as @@agaram_letters.
  @@mei_letters = ["க்","ச்","ட்","த்","ப்","ற்","ஞ்","ங்","ண்","ந்","ம்","ன்","ய்","ர்","ல்","வ்","ழ்","ள்" ]

  # Combining vowel signs, index-aligned with @@uyir_letters (the first entry
  # is empty because the inherent vowel needs no sign); the last entry is the
  # ayudha letter.
  @@accent_symbols = ["","ா","ி","ீ","ு","ூ","ெ","ே","ை","ொ","ோ","ௌ","ஃ"]
  @@pulli_symbols = ["்"]

  # Sanskrit (grantha) consonants, bare and with pulli.
  @@sanskrit_letters = ["ஶ","ஜ","ஷ", "ஸ","ஹ","க்ஷ"]
  @@sanskrit_mei_letters =["ஶ்","ஜ்","ஷ்", "ஸ்","ஹ்","க்ஷ்"]

  # Tamil + Sanskrit tables combined.
  @@grantha_mei_letters = @@mei_letters.clone()
  @@grantha_mei_letters.concat(@@sanskrit_mei_letters)

  @@grantha_agaram_letters = @@agaram_letters.clone()
  @@grantha_agaram_letters.concat(@@sanskrit_letters)

  # Consonant + vowel compounds (uyirmei): 18 rows of 12, one row per entry of
  # @@agaram_letters and one column per entry of @@uyir_letters.
  @@uyirmei_letters = [ "க" ,"கா" ,"கி" ,"கீ" ,"கு" ,"கூ" ,"கெ" ,"கே" ,"கை" ,"கொ" ,"கோ" ,"கௌ" ,
                        "ச" ,"சா" ,"சி" ,"சீ" ,"சு" ,"சூ" ,"செ" ,"சே" ,"சை" ,"சொ" ,"சோ" ,"சௌ" ,
                        "ட" ,"டா" ,"டி" ,"டீ" ,"டு" ,"டூ" ,"டெ" ,"டே" ,"டை" ,"டொ" ,"டோ" ,"டௌ",
                        "த" ,"தா" ,"தி" ,"தீ" ,"து" ,"தூ" ,"தெ" ,"தே" ,"தை" ,"தொ" ,"தோ" ,"தௌ",
                        "ப" ,"பா" ,"பி" ,"பீ" ,"பு" ,"பூ" ,"பெ" ,"பே" ,"பை" ,"பொ" ,"போ" ,"பௌ" ,
                        "ற" ,"றா" ,"றி" ,"றீ" ,"று" ,"றூ" ,"றெ" ,"றே" ,"றை" ,"றொ" ,"றோ" ,"றௌ",
                        "ஞ" ,"ஞா" ,"ஞி" ,"ஞீ" ,"ஞு" ,"ஞூ" ,"ஞெ" ,"ஞே" ,"ஞை" ,"ஞொ" ,"ஞோ" ,"ஞௌ" ,
                        "ங" ,"ஙா" ,"ஙி" ,"ஙீ" ,"ஙு" ,"ஙூ" ,"ஙெ" ,"ஙே" ,"ஙை" ,"ஙொ" ,"ஙோ" ,"ஙௌ" ,
                        "ண" ,"ணா" ,"ணி" ,"ணீ" ,"ணு" ,"ணூ" ,"ணெ" ,"ணே" ,"ணை" ,"ணொ" ,"ணோ" ,"ணௌ" ,
                        "ந" ,"நா" ,"நி" ,"நீ" ,"நு" ,"நூ" ,"நெ" ,"நே" ,"நை" ,"நொ" ,"நோ" ,"நௌ" ,
                        "ம" ,"மா" ,"மி" ,"மீ" ,"மு" ,"மூ" ,"மெ" ,"மே" ,"மை" ,"மொ" ,"மோ" ,"மௌ" ,
                        "ன" ,"னா" ,"னி" ,"னீ" ,"னு" ,"னூ" ,"னெ" ,"னே" ,"னை" ,"னொ" ,"னோ" ,"னௌ",
                        "ய" ,"யா" ,"யி" ,"யீ" ,"யு" ,"யூ" ,"யெ" ,"யே" ,"யை" ,"யொ" ,"யோ" ,"யௌ",
                        "ர" ,"ரா" ,"ரி" ,"ரீ" ,"ரு" ,"ரூ" ,"ரெ" ,"ரே" ,"ரை" ,"ரொ" ,"ரோ" ,"ரௌ",
                        "ல" ,"லா" ,"லி" ,"லீ" ,"லு" ,"லூ" ,"லெ" ,"லே" ,"லை" ,"லொ" ,"லோ" ,"லௌ" ,
                        "வ" ,"வா" ,"வி" ,"வீ" ,"வு" ,"வூ" ,"வெ" ,"வே" ,"வை" ,"வொ" ,"வோ" ,"வௌ" ,
                        "ழ" ,"ழா" ,"ழி" ,"ழீ" ,"ழு" ,"ழூ" ,"ழெ" ,"ழே" ,"ழை" ,"ழொ" ,"ழோ" ,"ழௌ" ,
                        "ள" ,"ளா" ,"ளி" ,"ளீ" ,"ளு" ,"ளூ" ,"ளெ" ,"ளே" ,"ளை" ,"ளொ" ,"ளோ" ,"ளௌ" ]

  # Full letter inventory backing Tamil.tamil_len. Previously this variable
  # was never defined, so tamil_len raised NameError. Layout: vowels, ayudha,
  # Tamil pure consonants, four Sanskrit pure consonants, Tamil uyirmei, then
  # Sanskrit uyirmei (each Sanskrit base combined with the 12 vowel signs).
  # The resulting size is TA_LETTERS_LEN (323).
  @@tamil_letters =
    @@uyir_letters +
    [@@ayudha_letter] +
    @@mei_letters +
    ["ஜ்", "ஷ்", "ஸ்", "ஹ்"] +
    @@uyirmei_letters +
    @@sanskrit_letters.flat_map do |base|
      @@accent_symbols.first(TA_UYIR_LEN).map { |sign| base + sign }
    end

  # Splits +word+ into a list of letters, keeping each base letter together
  # with the combining signs that follow it. Characters below U+00FF (for
  # example English text) are emitted one per entry.
  #
  # @param word [String] text to split; anything responding to +length+ and
  #   integer +[]+ that yields one-character strings also works
  # @return [Array<String>] the letters in order of appearance
  def self.get_letters(word)
    ta_letters = []
    # Becomes true once a Tamil (or other non-Latin) unit has been emitted;
    # until then a stray combining sign has nothing to attach to.
    not_empty = false

    word.length.times do |idx|
      c = word[idx]
      if @@uyir_letters.include?(c) || c == @@ayudha_letter ||
         @@grantha_agaram_letters.include?(c)
        # Independent vowel, ayudha or consonant base: starts a new letter.
        ta_letters << c
        not_empty = true
      elsif @@accent_symbols.include?(c)
        if not_empty
          # Vowel sign: attach to the preceding entry.
          # `+=` (not `<<`) so the caller's strings are never mutated.
          ta_letters[-1] += c
        else
          # odd situation: a sign with nothing before it stands alone
          ta_letters << c
          not_empty = true
        end
      elsif c < "\u00FF"
        # Latin-range character: always its own entry.
        ta_letters << c
      elsif not_empty
        # Anything else (e.g. the pulli) is glued onto the preceding entry.
        ta_letters[-1] += c
      else
        ta_letters << c
        not_empty = true
      end
    end

    ta_letters
  end

  ## length of the definitions

  # @return [Integer] number of accent symbols (13 = 12 + 1)
  def self.accent_len
    TA_ACCENT_LEN
  end

  # @return [Integer] number of ayudha letters (1)
  def self.ayudha_len
    TA_AYUDHA_LEN
  end

  # @return [Integer] number of uyir letters (12)
  def self.uyir_len
    TA_UYIR_LEN
  end

  # @return [Integer] number of mei letters (18)
  def self.mei_len
    TA_MEI_LEN
  end

  # @return [Integer] number of agaram letters (18)
  # @raise [AssertionError] if the table size disagrees with the constant
  def self.agaram_len
    assert { @@agaram_letters.length == TA_AGARAM_LEN }
    TA_AGARAM_LEN
  end

  # @return [Integer] number of uyirmei letters (216)
  def self.uyirmei_len
    TA_UYIRMEI_LEN
  end

  # @return [Integer] size of the full letter inventory
  def self.tamil_len
    @@tamil_letters.length
  end

  ## access the letters

  # @param idx [Integer] index into the uyir table, 0...uyir_len
  # @return [String] the uyir letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.uyir(idx)
    assert { (idx >= 0) && (idx < uyir_len) }
    # Was `Tamil::uyir_letters[idx]`, a call to a method that does not exist.
    @@uyir_letters[idx]
  end

  # @param idx [Integer] index into the agaram table, 0...agaram_len
  # @return [String] the agaram letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.agaram(idx)
    assert { (idx >= 0) && (idx < agaram_len) }
    @@agaram_letters[idx]
  end

  # @param idx [Integer] index into the mei table, 0...mei_len
  # @return [String] the mei letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.mei(idx)
    assert { (idx >= 0) && (idx < mei_len) }
    @@mei_letters[idx]
  end

  # @param idx [Integer] index into the uyirmei table, 0...uyirmei_len
  # @return [String] the uyirmei letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.uyirmei(idx)
    assert { (idx >= 0) && (idx < uyirmei_len) }
    @@uyirmei_letters[idx]
  end
end
|
164
|
+
|
165
|
+
# ## total tamil letters in use, including sanskrit letters
|
166
|
+
# tamil_letters = [
|
167
|
+
|
168
|
+
# ## /* Uyir */
|
169
|
+
# "அ","ஆ","இ", "ஈ","உ","ஊ","எ","ஏ","ஐ","ஒ","ஓ","ஔ",
|
170
|
+
|
171
|
+
# ##/* Ayuda Ezhuthu */
|
172
|
+
# "ஃ",
|
173
|
+
|
174
|
+
# ## /* Mei */
|
175
|
+
# "க்","ச்","ட்","த்","ப்","ற்","ஞ்","ங்","ண்","ந்","ம்","ன்","ய்","ர்","ல்","வ்","ழ்","ள்",
|
176
|
+
|
177
|
+
# ## /* Agaram */
|
178
|
+
# ## "க","ச","ட","த","ப","ற","ஞ","ங","ண","ந","ம","ன","ய","ர","ல","வ","ழ","ள",
|
179
|
+
|
180
|
+
# ## /* Sanskrit (Vada Mozhi) */
|
181
|
+
# ## "ஜ","ஷ", "ஸ","ஹ",
|
182
|
+
|
183
|
+
# ##/* Sanskrit (Mei) */
|
184
|
+
# "ஜ்","ஷ்", "ஸ்","ஹ்",
|
185
|
+
|
186
|
+
# ## /* Uyir Mei */
|
187
|
+
# "க" ,"கா" ,"கி" ,"கீ" ,"கு" ,"கூ" ,"கெ" ,"கே" ,"கை" ,"கொ" ,"கோ" ,"கௌ"
|
188
|
+
# ,"ச" ,"சா" ,"சி" ,"சீ" ,"சு" ,"சூ" ,"செ" ,"சே" ,"சை" ,"சொ" ,"சோ" ,"சௌ"
|
189
|
+
# ,"ட" ,"டா" ,"டி" ,"டீ" ,"டு" ,"டூ" ,"டெ" ,"டே" ,"டை" ,"டொ" ,"டோ" ,"டௌ"
|
190
|
+
# ,"த" ,"தா" ,"தி" ,"தீ" ,"து" ,"தூ" ,"தெ" ,"தே" ,"தை" ,"தொ" ,"தோ" ,"தௌ"
|
191
|
+
# ,"ப" ,"பா" ,"பி" ,"பீ" ,"பு" ,"பூ" ,"பெ" ,"பே" ,"பை" ,"பொ" ,"போ" ,"பௌ"
|
192
|
+
# ,"ற" ,"றா" ,"றி" ,"றீ" ,"று" ,"றூ" ,"றெ" ,"றே" ,"றை" ,"றொ" ,"றோ" ,"றௌ"
|
193
|
+
# ,"ஞ" ,"ஞா" ,"ஞி" ,"ஞீ" ,"ஞு" ,"ஞூ" ,"ஞெ" ,"ஞே" ,"ஞை" ,"ஞொ" ,"ஞோ" ,"ஞௌ"
|
194
|
+
# ,"ங" ,"ஙா" ,"ஙி" ,"ஙீ" ,"ஙு" ,"ஙூ" ,"ஙெ" ,"ஙே" ,"ஙை" ,"ஙொ" ,"ஙோ" ,"ஙௌ"
|
195
|
+
# ,"ண" ,"ணா" ,"ணி" ,"ணீ" ,"ணு" ,"ணூ" ,"ணெ" ,"ணே" ,"ணை" ,"ணொ" ,"ணோ" ,"ணௌ"
|
196
|
+
# ,"ந" ,"நா" ,"நி" ,"நீ" ,"நு" ,"நூ" ,"நெ" ,"நே" ,"நை" ,"நொ" ,"நோ" ,"நௌ"
|
197
|
+
# ,"ம" ,"மா" ,"மி" ,"மீ" ,"மு" ,"மூ" ,"மெ" ,"மே" ,"மை" ,"மொ" ,"மோ" ,"மௌ"
|
198
|
+
# ,"ன" ,"னா" ,"னி" ,"னீ" ,"னு" ,"னூ" ,"னெ" ,"னே" ,"னை" ,"னொ" ,"னோ" ,"னௌ"
|
199
|
+
# ,"ய" ,"யா" ,"யி" ,"யீ" ,"யு" ,"யூ" ,"யெ" ,"யே" ,"யை" ,"யொ" ,"யோ" ,"யௌ"
|
200
|
+
# ,"ர" ,"ரா" ,"ரி" ,"ரீ" ,"ரு" ,"ரூ" ,"ரெ" ,"ரே" ,"ரை" ,"ரொ" ,"ரோ" ,"ரௌ"
|
201
|
+
# ,"ல" ,"லா" ,"லி" ,"லீ" ,"லு" ,"லூ" ,"லெ" ,"லே" ,"லை" ,"லொ" ,"லோ" ,"லௌ"
|
202
|
+
# ,"வ" ,"வா" ,"வி" ,"வீ" ,"வு" ,"வூ" ,"வெ" ,"வே" ,"வை" ,"வொ" ,"வோ" ,"வௌ"
|
203
|
+
# ,"ழ" ,"ழா" ,"ழி" ,"ழீ" ,"ழு" ,"ழூ" ,"ழெ" ,"ழே" ,"ழை" ,"ழொ" ,"ழோ" ,"ழௌ"
|
204
|
+
# ,"ள" ,"ளா" ,"ளி" ,"ளீ" ,"ளு" ,"ளூ" ,"ளெ" ,"ளே" ,"ளை" ,"ளொ" ,"ளோ" ,"ளௌ"
|
205
|
+
|
206
|
+
# ##/* Sanskrit Uyir-Mei */
|
207
|
+
# ,"ஶ", "ஶா", "ஶி", "ஶீ", "ஶு", "ஶூ", "ஶெ", "ஶே", "ஶை", "ஶொ", "ஶோ", "ஶௌ"
|
208
|
+
# ,"ஜ" ,"ஜா" ,"ஜி" ,"ஜீ" ,"ஜு" ,"ஜூ" ,"ஜெ" ,"ஜே" ,"ஜை" ,"ஜொ" ,"ஜோ" ,"ஜௌ"
|
209
|
+
# ,"ஷ" ,"ஷா" ,"ஷி" ,"ஷீ" ,"ஷு" ,"ஷூ" ,"ஷெ" ,"ஷே" ,"ஷை" ,"ஷொ" ,"ஷோ" ,"ஷௌ"
|
210
|
+
# ,"ஸ" ,"ஸா" ,"ஸி" ,"ஸீ" ,"ஸு" ,"ஸூ" ,"ஸெ" ,"ஸே" ,"ஸை" ,"ஸொ" ,"ஸோ" ,"ஸௌ"
|
211
|
+
# ,"ஹ" ,"ஹா" ,"ஹி" ,"ஹீ" ,"ஹு" ,"ஹூ" ,"ஹெ" ,"ஹே" ,"ஹை" ,"ஹொ" ,"ஹோ" ,"ஹௌ"
|
212
|
+
# ,"க்ஷ" ,"க்ஷா" ,"க்ஷி" ,"க்ஷீ" ,"க்ஷு" ,"க்ஷூ" ,"க்ஷெ" ,"க்ஷே" ,"க்ஷை" ,"க்ஷொ" ,"க்ஷோ" ,"க்ஷௌ" ]
|
213
|
+
|
214
|
+
# grantha_uyirmei_letters = tamil_letters[tamil_letters.index("கா")-1:].clone()
|
215
|
+
|
216
|
+
|
217
|
+
# def uyirmei_constructed( mei_idx, uyir_idx):
|
218
|
+
# """ construct uyirmei letter given mei index and uyir index """
|
219
|
+
# idx,idy = mei_idx,uyir_idx
|
220
|
+
# assert ( idy >= 0 and idy < uyir_len() )
|
221
|
+
# assert ( idx >= 0 and idx < mei_len() )
|
222
|
+
# return agaram_letters[mei_idx]+accent_symbols[uyir_idx]
|
223
|
+
|
224
|
+
# def tamil( idx ):
|
225
|
+
# """ retrieve Tamil letter at canonical index from array utf8.tamil_letters """
|
226
|
+
# assert ( idx >= 0 and idx < tamil_len() )
|
227
|
+
# return tamil_letters[idx]
|
228
|
+
|
229
|
+
# # companion function to @tamil()
|
230
|
+
# def getidx(letter):
|
231
|
+
# for itr in range(0,tamil_len()):
|
232
|
+
# if tamil_letters[itr] == letter:
|
233
|
+
# return itr
|
234
|
+
# raise Exception("Cannot find letter in Tamil arichuvadi")
|
235
|
+
|
236
|
+
# ## useful part of the API:
|
237
|
+
# def istamil_prefix( word ):
|
238
|
+
# """ check if the given word has a tamil prefix. Returns
|
239
|
+
# either a True/False flag """
|
240
|
+
# for letter in tamil_letters:
|
241
|
+
# if ( word.find(letter) == 0 ):
|
242
|
+
# return True
|
243
|
+
# return False
|
244
|
+
|
245
|
+
# if not PYTHON3:
|
246
|
+
# is_tamil_unicode_predicate = lambda x: x >= unichr(2946) and x <= unichr(3066)
|
247
|
+
# else:
|
248
|
+
# is_tamil_unicode_predicate = lambda x: x >= chr(2946) and x <= chr(3066)
|
249
|
+
# def is_tamil_unicode( sequence ):
|
250
|
+
# # Ref: languagetool-office-extension/src/main/java/org/languagetool/openoffice/TamilDetector.java
|
251
|
+
# if type(sequence) is list:
|
252
|
+
# return list(map( is_tamil_unicode_predicate, sequence ))
|
253
|
+
# if len(sequence) > 1:
|
254
|
+
# return list(map( is_tamil_unicode_predicate, get_letters(sequence) ))
|
255
|
+
# return is_tamil_unicode_predicate( sequence )
|
256
|
+
|
257
|
+
# def all_tamil( word_in ):
|
258
|
+
# """ predicate checks if all letters of the input word are Tamil letters """
|
259
|
+
# if isinstance(word_in,list):
|
260
|
+
# word = word_in
|
261
|
+
# else:
|
262
|
+
# word = get_letters( word_in )
|
263
|
+
# return all( [(letter in tamil_letters) for letter in word] )
|
264
|
+
|
265
|
+
# def has_tamil( word ):
|
266
|
+
# """check if the word has any occurrence of any tamil letter """
|
267
|
+
# # list comprehension is not necessary - we bail at earliest
|
268
|
+
# for letters in tamil_letters:
|
269
|
+
# if ( word.find(letters) >= 0 ):
|
270
|
+
# return True
|
271
|
+
# return False
|
272
|
+
|
273
|
+
# def istamil( tchar ):
|
274
|
+
# """ check if the letter tchar is prefix of
|
275
|
+
# any of tamil-letter. It suggests we have a tamil identifier"""
|
276
|
+
# if (tchar in tamil_letters):
|
277
|
+
# return True
|
278
|
+
# return False
|
279
|
+
|
280
|
+
# def istamil_alnum( tchar ):
|
281
|
+
# """ check if the character is alphanumeric, or tamil.
|
282
|
+
# This saves time from running through istamil() check. """
|
283
|
+
# return ( tchar.isalnum( ) or istamil( tchar ) )
|
284
|
+
|
285
|
+
# def reverse_word( word ):
|
286
|
+
# """ reverse a Tamil word according to letters not unicode-points """
|
287
|
+
# op = get_letters( word )
|
288
|
+
# op.reverse()
|
289
|
+
# return "".join(op)
|
290
|
+
|
291
|
+
# ## find out if the letters like, "பொ" are written in canonical "ப + ொ"" graphemes then
|
292
|
+
# ## return True. If they are written like "ப + ெ + ா" then return False on first occurrence
|
293
|
+
# def is_normalized( text ):
|
294
|
+
# TLEN,idx = len(text),1
|
295
|
+
# kaal = "ா"
|
296
|
+
# sinna_kombu, periya_kombu = "ெ", "ே"
|
297
|
+
# kombugal = [sinna_kombu, periya_kombu]
|
298
|
+
|
299
|
+
# def predicate( last_letter, prev_letter):
|
300
|
+
# if ((last_letter == kaal) and (prev_letter in kombugal)):
|
301
|
+
# return True
|
302
|
+
# return False
|
303
|
+
# if TLEN < 2:
|
304
|
+
# return True
|
305
|
+
# elif TLEN == 2:
|
306
|
+
# if predicate( text[-1], text[-2] ):
|
307
|
+
# return False
|
308
|
+
# return True
|
309
|
+
# a = text[0]
|
310
|
+
# b = text[1]
|
311
|
+
# assert idx == 1
|
312
|
+
# while (idx < TLEN):
|
313
|
+
# if predicate(b,a):
|
314
|
+
# return False
|
315
|
+
# a=b
|
316
|
+
# idx = idx + 1
|
317
|
+
# if idx < TLEN:
|
318
|
+
# b=text[idx]
|
319
|
+
# # reached end and nothing tripped us
|
320
|
+
# return True
|
321
|
+
|
322
|
+
# def _make_set(args):
|
323
|
+
# if PYTHON3:
|
324
|
+
# return frozenset(args)
|
325
|
+
# return set(args)
|
326
|
+
|
327
|
+
# grantha_agaram_set = _make_set(grantha_agaram_letters)
|
328
|
+
# accent_symbol_set = _make_set(accent_symbols)
|
329
|
+
# uyir_letter_set = _make_set(uyir_letters)
|
330
|
+
|
331
|
+
|
332
|
+
# _all_symbols = copy( accent_symbols )
|
333
|
+
# _all_symbols.extend( pulli_symbols )
|
334
|
+
# all_symbol_set = _make_set(_all_symbols)
|
335
|
+
|
336
|
+
# # same as get_letters but use as iterable
|
337
|
+
# def get_letters_iterable( word ):
|
338
|
+
# """ splits the word into a character-list of tamil/english
|
339
|
+
# characters present in the stream """
|
340
|
+
# WLEN,idx = len(word),0
|
341
|
+
|
342
|
+
# while (idx < WLEN):
|
343
|
+
# c = word[idx]
|
344
|
+
# #print(idx,hex(ord(c)),len(ta_letters))
|
345
|
+
# if c in uyir_letter_set or c == ayudha_letter:
|
346
|
+
# idx = idx + 1
|
347
|
+
# yield c
|
348
|
+
# elif c in grantha_agaram_set:
|
349
|
+
# if idx + 1 < WLEN and word[idx+1] in all_symbol_set:
|
350
|
+
# c2 = word[idx+1]
|
351
|
+
# idx = idx + 2
|
352
|
+
# yield (c + c2)
|
353
|
+
# else:
|
354
|
+
# idx = idx + 1
|
355
|
+
# yield c
|
356
|
+
# else:
|
357
|
+
# idx = idx + 1
|
358
|
+
# yield c
|
359
|
+
# raise StopIteration
|
360
|
+
|
361
|
+
# def get_words(letters,tamil_only=False):
|
362
|
+
# return [ word for word in get_words_iterable(letters,tamil_only) ]
|
363
|
+
|
364
|
+
# def get_words_iterable( letters, tamil_only=False ):
|
365
|
+
# """ given a list of UTF-8 letters section them into words, grouping them at spaces """
|
366
|
+
|
367
|
+
# # correct algorithm for get-tamil-words
|
368
|
+
# buf = []
|
369
|
+
# for idx,let in enumerate(letters):
|
370
|
+
# if not let.isspace():
|
371
|
+
# if istamil(let) or (not tamil_only):
|
372
|
+
# buf.append( let )
|
373
|
+
# else:
|
374
|
+
# if len(buf) > 0:
|
375
|
+
# yield "".join( buf )
|
376
|
+
# buf = []
|
377
|
+
# if len(buf) > 0:
|
378
|
+
# yield "".join(buf)
|
379
|
+
|
380
|
+
# def get_tamil_words( letters ):
|
381
|
+
# """ return the list of Tamil-only words found in the given letters """
|
382
|
+
# return [word for word in get_words_iterable( letters, tamil_only = True )]
|
383
|
+
|
384
|
+
# if PYTHON3:
|
385
|
+
# def cmp( x, y):
|
386
|
+
# if x == y:
|
387
|
+
# return 0
|
388
|
+
# if x > y:
|
389
|
+
# return 1
|
390
|
+
# return -1
|
391
|
+
|
392
|
+
# # answer if word_a ranks ahead of, or at same level, as word_b in a Tamil dictionary order...
|
393
|
+
# # for use with Python : if a > 0
|
394
|
+
# def compare_words_lexicographic( word_a, word_b ):
|
395
|
+
# """ compare words in Tamil lexicographic order """
|
396
|
+
# # sanity check for words to be all Tamil
|
397
|
+
# if ( not all_tamil(word_a) ) or (not all_tamil(word_b)) :
|
398
|
+
# print("## ")
|
399
|
+
# print(word_a)
|
400
|
+
# print(word_b)
|
401
|
+
# print("Both operands need to be Tamil words")
|
402
|
+
# La = len(word_a)
|
403
|
+
# Lb = len(word_b)
|
404
|
+
# all_TA_letters = "".join(tamil_letters)
|
405
|
+
# for itr in range(0,min(La,Lb)):
|
406
|
+
# pos1 = all_TA_letters.find( word_a[itr] )
|
407
|
+
# pos2 = all_TA_letters.find( word_b[itr] )
|
408
|
+
|
409
|
+
# if pos1 != pos2 :
|
410
|
+
# #print not( pos1 > pos2), pos1, pos2
|
411
|
+
# return cmp(pos1, pos2)
|
412
|
+
|
413
|
+
# # result depends on if La is shorter than Lb, or 0 if La == Lb i.e. cmp
|
414
|
+
# return cmp(La,Lb)
|
415
|
+
|
416
|
+
# # return a list of ordered-pairs containing positions
|
417
|
+
# # that are common in word_a, and word_b; e.g.
|
418
|
+
# # தேடுக x தடங்கல் -> one common letter க [(2,3)]
|
419
|
+
# # சொல் x தேடுக -> no common letters []
|
420
|
+
# def word_intersection( word_a, word_b ):
|
421
|
+
# """ return a list of tuples where word_a, word_b intersect """
|
422
|
+
# positions = []
|
423
|
+
# word_a_letters = get_letters( word_a )
|
424
|
+
# word_b_letters = get_letters( word_b )
|
425
|
+
# for idx,wa in enumerate(word_a_letters):
|
426
|
+
# for idy,wb in enumerate(word_b_letters):
|
427
|
+
# if ( wa == wb ):
|
428
|
+
# positions.append( (idx, idy) )
|
429
|
+
# return positions
|
430
|
+
|
431
|
+
# def splitMeiUyir(uyirmei_char):
|
432
|
+
# """
|
433
|
+
# This function splits a uyirmei compound character into mei + uyir characters
|
434
|
+
# and returns them as a tuple.
|
435
|
+
|
436
|
+
# Input : It must be unicode tamil char.
|
437
|
+
|
438
|
+
# Written By : Arulalan.T
|
439
|
+
# Date : 22.09.2014
|
440
|
+
|
441
|
+
# """
|
442
|
+
|
443
|
+
# if not isinstance(uyirmei_char, PYTHON3 and str or unicode):
|
444
|
+
# raise ValueError("Passed input letter '%s' must be unicode, \
|
445
|
+
# not just string" % uyirmei_char)
|
446
|
+
|
447
|
+
# if uyirmei_char in mei_letters:
|
448
|
+
# return uyirmei_char
|
449
|
+
|
450
|
+
# if uyirmei_char in uyir_letters:
|
451
|
+
# return uyirmei_char
|
452
|
+
|
453
|
+
# if uyirmei_char not in grantha_uyirmei_letters:
|
454
|
+
# raise ValueError("Passed input letter '%s' is not tamil letter" % uyirmei_char)
|
455
|
+
|
456
|
+
# idx = grantha_uyirmei_letters.index(uyirmei_char)
|
457
|
+
# uyiridx = idx % 12
|
458
|
+
# meiidx = int((idx - uyiridx)/ 12)
|
459
|
+
# return (grantha_mei_letters[meiidx], uyir_letters[uyiridx])
|
460
|
+
# # end of def splitMeiUyir(uyirmei_char):
|
461
|
+
|
462
|
+
# def joinMeiUyir(mei_char, uyir_char):
|
463
|
+
# """
|
464
|
+
# This function joins a mei character and a uyir character, and returns the
|
465
|
+
# compound uyirmei unicode character.
|
466
|
+
|
467
|
+
# Inputs:
|
468
|
+
# mei_char : It must be unicode tamil mei char.
|
469
|
+
# uyir_char : It must be unicode tamil uyir char.
|
470
|
+
|
471
|
+
# Written By : Arulalan.T
|
472
|
+
# Date : 22.09.2014
|
473
|
+
# """
|
474
|
+
# if not isinstance(mei_char, PYTHON3 and str or unicode):
|
475
|
+
# raise ValueError("Passed input mei character '%s' must be unicode, \
|
476
|
+
# not just string" % mei_char)
|
477
|
+
# if not isinstance(uyir_char, PYTHON3 and str or unicode):
|
478
|
+
# raise ValueError("Passed input uyir character '%s' must be unicode, \
|
479
|
+
# not just string" % uyir_char)
|
480
|
+
# if mei_char not in grantha_mei_letters:
|
481
|
+
# raise ValueError("Passed input character '%s' is not a"
|
482
|
+
# "tamil mei character" % mei_char)
|
483
|
+
# if uyir_char not in uyir_letters:
|
484
|
+
# raise ValueError("Passed input character '%s' is not a"
|
485
|
+
# "tamil uyir character" % uyir_char)
|
486
|
+
# uyiridx = uyir_letters.index(uyir_char)
|
487
|
+
# meiidx = grantha_mei_letters.index(mei_char)
|
488
|
+
# # calculate uyirmei index
|
489
|
+
# uyirmeiidx = meiidx*12 + uyiridx
|
490
|
+
# return grantha_uyirmei_letters[uyirmeiidx]
|
491
|
+
|
492
|
+
# Tamil Letters
|
493
|
+
# அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ ஃ
|
494
|
+
# க் ச் ட் த் ப் ற் ஞ் ங் ண் ந் ம் ன் ய் ர் ல் வ் ழ் ள் ஜ் ஷ் ஸ் ஹ்
|
495
|
+
# க ச ட த ப ற ஞ ங ண ந ம ன ய ர ல வ ழ ள ஜ ஷ ஸ ஹ
|
496
|
+
# க கா கி கீ கு கூ கெ கே கை கொ கோ கௌ
|
497
|
+
# ச சா சி சீ சு சூ செ சே சை சொ சோ சௌ
|
498
|
+
# ட டா டி டீ டு டூ டெ டே டை டொ டோ டௌ
|
499
|
+
# த தா தி தீ து தூ தெ தே தை தொ தோ தௌ
|
500
|
+
# ப பா பி பீ பு பூ பெ பே பை பொ போ பௌ
|
501
|
+
# ற றா றி றீ று றூ றெ றே றை றொ றோ றௌ
|
502
|
+
# ஞ ஞா ஞி ஞீ ஞு ஞூ ஞெ ஞே ஞை ஞொ ஞோ ஞௌ
|
503
|
+
# ங ஙா ஙி ஙீ ஙு ஙூ ஙெ ஙே ஙை ஙொ ஙோ ஙௌ
|
504
|
+
# ண ணா ணி ணீ ணு ணூ ணெ ணே ணை ணொ ணோ ணௌ
|
505
|
+
# ந நா நி நீ நு நூ நெ நே நை நொ நோ நௌ
|
506
|
+
# ம மா மி மீ மு மூ மெ மே மை மொ மோ மௌ
|
507
|
+
# ன னா னி னீ னு னூ னெ னே னை னொ னோ னௌ
|
508
|
+
# ய யா யி யீ யு யூ யெ யே யை யொ யோ யௌ
|
509
|
+
# ர ரா ரி ரீ ரு ரூ ரெ ரே ரை ரொ ரோ ரௌ
|
510
|
+
# ல லா லி லீ லு லூ லெ லே லை லொ லோ லௌ
|
511
|
+
# வ வா வி வீ வு வூ வெ வே வை வொ வோ வௌ
|
512
|
+
# ழ ழா ழி ழீ ழு ழூ ழெ ழே ழை ழொ ழோ ழௌ
|
513
|
+
# ள ளா ளி ளீ ளு ளூ ளெ ளே ளை ளொ ளோ ளௌ
|
514
|
+
# ஶ ஶா ஶி ஶீ ஶு ஶூ ஶெ ஶே ஶை ஶொ ஶோ ஶௌ
|
515
|
+
# ஜ ஜா ஜி ஜீ ஜு ஜூ ஜெ ஜே ஜை ஜொ ஜோ ஜௌ
|
516
|
+
# ஷ ஷா ஷி ஷீ ஷு ஷூ ஷெ ஷே ஷை ஷொ ஷோ ஷௌ
|
517
|
+
# ஸ ஸா ஸி ஸீ ஸு ஸூ ஸெ ஸே ஸை ஸொ ஸோ ஸௌ
|
518
|
+
# ஹ ஹா ஹி ஹீ ஹு ஹூ ஹெ ஹே ஹை ஹொ ஹோ ஹௌ
|
519
|
+
# க்ஷ க்ஷா க்ஷி க்ஷீ க்ஷு க்ஷூ க்ஷெ க்ஷே க்ஷை க்ஷொ க்ஷோ க்ஷௌ
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tamil
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.11'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Muthu Annamalai
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-05-19 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: open-tamil project provides a ruby gem 'tamil' for working with Tamil
|
14
|
+
language text and NLP
|
15
|
+
email: ezhillang@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/tamil.rb
|
21
|
+
homepage: http://ezhillang.org
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.0.14
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: open-tamil project provides a ruby gem 'tamil'
|
45
|
+
test_files: []
|