tamil 0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/tamil.rb +519 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cd7f55b1244ad7f539d0d0211027171f2ea7acdf
|
4
|
+
data.tar.gz: 198b5958f12c7e363b3d86ed736ab82a4f496fb4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c1d8bd9c3df20586ffb1f4e56a36e0bb949fe961d365781e770d371c1bd56957b0a208417c457d8f0a20c34278b994790ee6e7e6db570166344bab3b4cd8055b
|
7
|
+
data.tar.gz: 9abf039aadfdc6b8a95cc7646ee433ead2ac54d8e252c6df8283e612e7c15431b5bb810a06a4274028e378a2d3527b56531ad6cb5a6aacab47ae5bf0a97554f5
|
data/lib/tamil.rb
ADDED
@@ -0,0 +1,519 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# (C) 2015 Muthiah Annamalai <ezhillang@gmail.com>
|
3
|
+
|
4
|
+
# Error type raised when an assertion block evaluates to false or nil.
class AssertionError < RuntimeError; end
|
6
|
+
|
7
|
+
# Evaluates the given block and raises AssertionError when its result is
# false or nil. Returns nil when the block's result is truthy.
def assert(&block)
  condition_holds = yield
  raise AssertionError unless condition_holds
end
|
10
|
+
|
11
|
+
# Letter tables and helpers for working with Tamil text.
#
# The module exposes the sizes of the Tamil letter classes, indexed access to
# the letter tables, and Tamil.get_letters, which splits a string into Tamil
# letters (a base character plus any trailing signs) instead of code points.
#
# The index accessors rely on the top-level +assert+ helper defined earlier in
# this file and raise AssertionError for out-of-range indices.
module Tamil
  ## constants
  TA_ACCENT_LEN = 13 #12 + 1
  TA_AYUDHA_LEN = 1
  TA_UYIR_LEN = 12
  TA_MEI_LEN = 18
  TA_AGARAM_LEN = 18
  TA_SANSKRIT_LEN = 6
  TA_UYIRMEI_LEN = 216
  TA_GRANTHA_UYIRMEI_LEN = 24*12
  TA_LETTERS_LEN = 247 + 6*12 + 22 + 4 - TA_AGARAM_LEN - 4 #323

  # List of letters you can use
  @@agaram_letters = ["க","ச","ட","த","ப","ற","ஞ","ங","ண","ந","ம","ன","ய","ர","ல","வ","ழ","ள"]
  AGARAM_LETTERS = @@agaram_letters.clone

  @@uyir_letters = ["அ","ஆ","இ","ஈ","உ","ஊ","எ","ஏ","ஐ","ஒ","ஓ","ஔ"]
  @@ayudha_letter = "ஃ"

  @@kuril_letters = ["அ", "இ", "உ", "எ", "ஒ"]
  @@nedil_letters = ["ஆ", "ஈ", "ஊ", "ஏ", "ஓ"]

  @@vallinam_letters = ["க்", "ச்", "ட்", "த்", "ப்", "ற்"]
  @@mellinam_letters = ["ங்", "ஞ்", "ண்", "ந்", "ம்", "ன்"]
  @@idayinam_letters = ["ய்", "ர்", "ல்", "வ்", "ழ்", "ள்"]

  @@mei_letters = ["க்","ச்","ட்","த்","ப்","ற்","ஞ்","ங்","ண்","ந்","ம்","ன்","ய்","ர்","ல்","வ்","ழ்","ள்" ]

  # First entry is the empty sign (the bare consonant); last entry is aytham.
  @@accent_symbols = ["","ா","ி","ீ","ு","ூ","ெ","ே","ை","ொ","ோ","ௌ","ஃ"]
  @@pulli_symbols = ["்"]

  @@sanskrit_letters = ["ஶ","ஜ","ஷ", "ஸ","ஹ","க்ஷ"]
  @@sanskrit_mei_letters =["ஶ்","ஜ்","ஷ்", "ஸ்","ஹ்","க்ஷ்"]

  @@grantha_mei_letters = @@mei_letters + @@sanskrit_mei_letters

  @@grantha_agaram_letters = @@agaram_letters + @@sanskrit_letters

  @@uyirmei_letters = [ "க" ,"கா" ,"கி" ,"கீ" ,"கு" ,"கூ" ,"கெ" ,"கே" ,"கை" ,"கொ" ,"கோ" ,"கௌ" ,
                        "ச" ,"சா" ,"சி" ,"சீ" ,"சு" ,"சூ" ,"செ" ,"சே" ,"சை" ,"சொ" ,"சோ" ,"சௌ" ,
                        "ட" ,"டா" ,"டி" ,"டீ" ,"டு" ,"டூ" ,"டெ" ,"டே" ,"டை" ,"டொ" ,"டோ" ,"டௌ",
                        "த" ,"தா" ,"தி" ,"தீ" ,"து" ,"தூ" ,"தெ" ,"தே" ,"தை" ,"தொ" ,"தோ" ,"தௌ",
                        "ப" ,"பா" ,"பி" ,"பீ" ,"பு" ,"பூ" ,"பெ" ,"பே" ,"பை" ,"பொ" ,"போ" ,"பௌ" ,
                        "ற" ,"றா" ,"றி" ,"றீ" ,"று" ,"றூ" ,"றெ" ,"றே" ,"றை" ,"றொ" ,"றோ" ,"றௌ",
                        "ஞ" ,"ஞா" ,"ஞி" ,"ஞீ" ,"ஞு" ,"ஞூ" ,"ஞெ" ,"ஞே" ,"ஞை" ,"ஞொ" ,"ஞோ" ,"ஞௌ" ,
                        "ங" ,"ஙா" ,"ஙி" ,"ஙீ" ,"ஙு" ,"ஙூ" ,"ஙெ" ,"ஙே" ,"ஙை" ,"ஙொ" ,"ஙோ" ,"ஙௌ" ,
                        "ண" ,"ணா" ,"ணி" ,"ணீ" ,"ணு" ,"ணூ" ,"ணெ" ,"ணே" ,"ணை" ,"ணொ" ,"ணோ" ,"ணௌ" ,
                        "ந" ,"நா" ,"நி" ,"நீ" ,"நு" ,"நூ" ,"நெ" ,"நே" ,"நை" ,"நொ" ,"நோ" ,"நௌ" ,
                        "ம" ,"மா" ,"மி" ,"மீ" ,"மு" ,"மூ" ,"மெ" ,"மே" ,"மை" ,"மொ" ,"மோ" ,"மௌ" ,
                        "ன" ,"னா" ,"னி" ,"னீ" ,"னு" ,"னூ" ,"னெ" ,"னே" ,"னை" ,"னொ" ,"னோ" ,"னௌ",
                        "ய" ,"யா" ,"யி" ,"யீ" ,"யு" ,"யூ" ,"யெ" ,"யே" ,"யை" ,"யொ" ,"யோ" ,"யௌ",
                        "ர" ,"ரா" ,"ரி" ,"ரீ" ,"ரு" ,"ரூ" ,"ரெ" ,"ரே" ,"ரை" ,"ரொ" ,"ரோ" ,"ரௌ",
                        "ல" ,"லா" ,"லி" ,"லீ" ,"லு" ,"லூ" ,"லெ" ,"லே" ,"லை" ,"லொ" ,"லோ" ,"லௌ" ,
                        "வ" ,"வா" ,"வி" ,"வீ" ,"வு" ,"வூ" ,"வெ" ,"வே" ,"வை" ,"வொ" ,"வோ" ,"வௌ" ,
                        "ழ" ,"ழா" ,"ழி" ,"ழீ" ,"ழு" ,"ழூ" ,"ழெ" ,"ழே" ,"ழை" ,"ழொ" ,"ழோ" ,"ழௌ" ,
                        "ள" ,"ளா" ,"ளி" ,"ளீ" ,"ளு" ,"ளூ" ,"ளெ" ,"ளே" ,"ளை" ,"ளொ" ,"ளோ" ,"ளௌ" ]

  # Compound forms of the six grantha consonants: each consonant followed by
  # one of the first 12 accent symbols (6 * 12 = 72 entries).
  @@sanskrit_uyirmei_letters = @@sanskrit_letters.flat_map do |consonant|
    @@accent_symbols.first(TA_UYIR_LEN).map { |sign| consonant + sign }
  end

  # Every letter in use, in the order of the reference table kept in the
  # commented-out section of this file: uyir, aytham, mei, four grantha mei
  # (entries 1..4 of the grantha mei table), uyirmei, grantha uyirmei.
  # The total is TA_LETTERS_LEN (323) entries.
  @@tamil_letters = @@uyir_letters + [@@ayudha_letter] + @@mei_letters +
                    @@sanskrit_mei_letters[1, 4] + @@uyirmei_letters +
                    @@sanskrit_uyirmei_letters

  # Splits +word+ into a list of letters.
  #
  # Vowels, aytham and base consonants each start a new letter. Accent symbols
  # and any other character at or above U+00FF (the pulli, for example) are
  # appended to the most recently emitted entry once at least one Tamil letter
  # has been seen; before that they are emitted on their own. Characters below
  # U+00FF are always emitted as separate entries.
  #
  # @param word [String] text to split
  # @return [Array<String>] the letters of +word+, in order
  def self.get_letters(word)
    ta_letters = []
    not_empty = false # becomes true once a Tamil letter has been emitted

    (0...word.length).each do |idx|
      c = word[idx]
      if @@uyir_letters.include?(c) || c == @@ayudha_letter ||
         @@grantha_agaram_letters.include?(c)
        ta_letters << c
        not_empty = true
      elsif !@@accent_symbols.include?(c) && c < "\u00FF"
        ta_letters << c
      elsif not_empty
        # combining sign: attach it to the letter it modifies
        ta_letters[-1] += c
      else
        # odd situation: a sign with no preceding Tamil letter
        ta_letters << c
        not_empty = true
      end
    end

    ta_letters
  end

  ## length of the definitions

  # @return [Integer] number of accent symbols (13 = 12 + 1)
  def self.accent_len
    TA_ACCENT_LEN
  end

  # @return [Integer] number of aytham letters (1)
  def self.ayudha_len
    TA_AYUDHA_LEN
  end

  # @return [Integer] number of uyir letters (12)
  def self.uyir_len
    TA_UYIR_LEN
  end

  # @return [Integer] number of mei letters (18)
  def self.mei_len
    TA_MEI_LEN
  end

  # @return [Integer] number of agaram letters (18)
  # @raise [AssertionError] if the agaram table does not have that many entries
  def self.agaram_len
    assert { @@agaram_letters.length == TA_AGARAM_LEN }
    TA_AGARAM_LEN
  end

  # @return [Integer] number of uyirmei letters (216)
  def self.uyirmei_len
    TA_UYIRMEI_LEN
  end

  # @return [Integer] number of entries in the full letter table
  def self.tamil_len
    @@tamil_letters.length
  end

  ## access the letters

  # @param idx [Integer] index into the uyir table, 0...uyir_len
  # @return [String] the uyir letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.uyir(idx)
    assert { idx >= 0 && idx < uyir_len }
    @@uyir_letters[idx]
  end

  # @param idx [Integer] index into the agaram table, 0...agaram_len
  # @return [String] the agaram letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.agaram(idx)
    assert { idx >= 0 && idx < agaram_len }
    @@agaram_letters[idx]
  end

  # @param idx [Integer] index into the mei table, 0...mei_len
  # @return [String] the mei letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.mei(idx)
    assert { idx >= 0 && idx < mei_len }
    @@mei_letters[idx]
  end

  # @param idx [Integer] index into the uyirmei table, 0...uyirmei_len
  # @return [String] the uyirmei letter at +idx+
  # @raise [AssertionError] if +idx+ is out of range
  def self.uyirmei(idx)
    assert { idx >= 0 && idx < uyirmei_len }
    @@uyirmei_letters[idx]
  end
end
|
164
|
+
|
165
|
+
# ## total tamil letters in use, including sanskrit letters
|
166
|
+
# tamil_letters = [
|
167
|
+
|
168
|
+
# ## /* Uyir */
|
169
|
+
# "அ","ஆ","இ", "ஈ","உ","ஊ","எ","ஏ","ஐ","ஒ","ஓ","ஔ",
|
170
|
+
|
171
|
+
# ##/* Ayuda Ezhuthu */
|
172
|
+
# "ஃ",
|
173
|
+
|
174
|
+
# ## /* Mei */
|
175
|
+
# "க்","ச்","ட்","த்","ப்","ற்","ஞ்","ங்","ண்","ந்","ம்","ன்","ய்","ர்","ல்","வ்","ழ்","ள்",
|
176
|
+
|
177
|
+
# ## /* Agaram */
|
178
|
+
# ## "க","ச","ட","த","ப","ற","ஞ","ங","ண","ந","ம","ன","ய","ர","ல","வ","ழ","ள",
|
179
|
+
|
180
|
+
# ## /* Sanskrit (Vada Mozhi) */
|
181
|
+
# ## "ஜ","ஷ", "ஸ","ஹ",
|
182
|
+
|
183
|
+
# ##/* Sanskrit (Mei) */
|
184
|
+
# "ஜ்","ஷ்", "ஸ்","ஹ்",
|
185
|
+
|
186
|
+
# ## /* Uyir Mei */
|
187
|
+
# "க" ,"கா" ,"கி" ,"கீ" ,"கு" ,"கூ" ,"கெ" ,"கே" ,"கை" ,"கொ" ,"கோ" ,"கௌ"
|
188
|
+
# ,"ச" ,"சா" ,"சி" ,"சீ" ,"சு" ,"சூ" ,"செ" ,"சே" ,"சை" ,"சொ" ,"சோ" ,"சௌ"
|
189
|
+
# ,"ட" ,"டா" ,"டி" ,"டீ" ,"டு" ,"டூ" ,"டெ" ,"டே" ,"டை" ,"டொ" ,"டோ" ,"டௌ"
|
190
|
+
# ,"த" ,"தா" ,"தி" ,"தீ" ,"து" ,"தூ" ,"தெ" ,"தே" ,"தை" ,"தொ" ,"தோ" ,"தௌ"
|
191
|
+
# ,"ப" ,"பா" ,"பி" ,"பீ" ,"பு" ,"பூ" ,"பெ" ,"பே" ,"பை" ,"பொ" ,"போ" ,"பௌ"
|
192
|
+
# ,"ற" ,"றா" ,"றி" ,"றீ" ,"று" ,"றூ" ,"றெ" ,"றே" ,"றை" ,"றொ" ,"றோ" ,"றௌ"
|
193
|
+
# ,"ஞ" ,"ஞா" ,"ஞி" ,"ஞீ" ,"ஞு" ,"ஞூ" ,"ஞெ" ,"ஞே" ,"ஞை" ,"ஞொ" ,"ஞோ" ,"ஞௌ"
|
194
|
+
# ,"ங" ,"ஙா" ,"ஙி" ,"ஙீ" ,"ஙு" ,"ஙூ" ,"ஙெ" ,"ஙே" ,"ஙை" ,"ஙொ" ,"ஙோ" ,"ஙௌ"
|
195
|
+
# ,"ண" ,"ணா" ,"ணி" ,"ணீ" ,"ணு" ,"ணூ" ,"ணெ" ,"ணே" ,"ணை" ,"ணொ" ,"ணோ" ,"ணௌ"
|
196
|
+
# ,"ந" ,"நா" ,"நி" ,"நீ" ,"நு" ,"நூ" ,"நெ" ,"நே" ,"நை" ,"நொ" ,"நோ" ,"நௌ"
|
197
|
+
# ,"ம" ,"மா" ,"மி" ,"மீ" ,"மு" ,"மூ" ,"மெ" ,"மே" ,"மை" ,"மொ" ,"மோ" ,"மௌ"
|
198
|
+
# ,"ன" ,"னா" ,"னி" ,"னீ" ,"னு" ,"னூ" ,"னெ" ,"னே" ,"னை" ,"னொ" ,"னோ" ,"னௌ"
|
199
|
+
# ,"ய" ,"யா" ,"யி" ,"யீ" ,"யு" ,"யூ" ,"யெ" ,"யே" ,"யை" ,"யொ" ,"யோ" ,"யௌ"
|
200
|
+
# ,"ர" ,"ரா" ,"ரி" ,"ரீ" ,"ரு" ,"ரூ" ,"ரெ" ,"ரே" ,"ரை" ,"ரொ" ,"ரோ" ,"ரௌ"
|
201
|
+
# ,"ல" ,"லா" ,"லி" ,"லீ" ,"லு" ,"லூ" ,"லெ" ,"லே" ,"லை" ,"லொ" ,"லோ" ,"லௌ"
|
202
|
+
# ,"வ" ,"வா" ,"வி" ,"வீ" ,"வு" ,"வூ" ,"வெ" ,"வே" ,"வை" ,"வொ" ,"வோ" ,"வௌ"
|
203
|
+
# ,"ழ" ,"ழா" ,"ழி" ,"ழீ" ,"ழு" ,"ழூ" ,"ழெ" ,"ழே" ,"ழை" ,"ழொ" ,"ழோ" ,"ழௌ"
|
204
|
+
# ,"ள" ,"ளா" ,"ளி" ,"ளீ" ,"ளு" ,"ளூ" ,"ளெ" ,"ளே" ,"ளை" ,"ளொ" ,"ளோ" ,"ளௌ"
|
205
|
+
|
206
|
+
# ##/* Sanskrit Uyir-Mei */
|
207
|
+
# ,"ஶ", "ஶா", "ஶி", "ஶீ", "ஶு", "ஶூ", "ஶெ", "ஶே", "ஶை", "ஶொ", "ஶோ", "ஶௌ"
|
208
|
+
# ,"ஜ" ,"ஜா" ,"ஜி" ,"ஜீ" ,"ஜு" ,"ஜூ" ,"ஜெ" ,"ஜே" ,"ஜை" ,"ஜொ" ,"ஜோ" ,"ஜௌ"
|
209
|
+
# ,"ஷ" ,"ஷா" ,"ஷி" ,"ஷீ" ,"ஷு" ,"ஷூ" ,"ஷெ" ,"ஷே" ,"ஷை" ,"ஷொ" ,"ஷோ" ,"ஷௌ"
|
210
|
+
# ,"ஸ" ,"ஸா" ,"ஸி" ,"ஸீ" ,"ஸு" ,"ஸூ" ,"ஸெ" ,"ஸே" ,"ஸை" ,"ஸொ" ,"ஸோ" ,"ஸௌ"
|
211
|
+
# ,"ஹ" ,"ஹா" ,"ஹி" ,"ஹீ" ,"ஹு" ,"ஹூ" ,"ஹெ" ,"ஹே" ,"ஹை" ,"ஹொ" ,"ஹோ" ,"ஹௌ"
|
212
|
+
# ,"க்ஷ" ,"க்ஷா" ,"க்ஷி" ,"க்ஷீ" ,"க்ஷு" ,"க்ஷூ" ,"க்ஷெ" ,"க்ஷே" ,"க்ஷை" ,"க்ஷொ" ,"க்ஷோ" ,"க்ஷௌ" ]
|
213
|
+
|
214
|
+
# grantha_uyirmei_letters = tamil_letters[tamil_letters.index("கா")-1:].clone()
|
215
|
+
|
216
|
+
|
217
|
+
# def uyirmei_constructed( mei_idx, uyir_idx):
|
218
|
+
# """ construct uyirmei letter give mei index and uyir index """
|
219
|
+
# idx,idy = mei_idx,uyir_idx
|
220
|
+
# assert ( idy >= 0 and idy < uyir_len() )
|
221
|
+
# assert ( idx >= 0 and idx < mei_len() )
|
222
|
+
# return agaram_letters[mei_idx]+accent_symbols[uyir_idx]
|
223
|
+
|
224
|
+
# def tamil( idx ):
|
225
|
+
# """ retrieve Tamil letter at canonical index from array utf8.tamil_letters """
|
226
|
+
# assert ( idx >= 0 and idx < tamil_len() )
|
227
|
+
# return tamil_letters[idx]
|
228
|
+
|
229
|
+
# # companion function to @tamil()
|
230
|
+
# def getidx(letter):
|
231
|
+
# for itr in range(0,tamil_len()):
|
232
|
+
# if tamil_letters[itr] == letter:
|
233
|
+
# return itr
|
234
|
+
# raise Exception("Cannot find letter in Tamil arichuvadi")
|
235
|
+
|
236
|
+
# ## useful part of the API:
|
237
|
+
# def istamil_prefix( word ):
|
238
|
+
# """ check if the given word has a tamil prefix. Returns
|
239
|
+
# either a True/False flag """
|
240
|
+
# for letter in tamil_letters:
|
241
|
+
# if ( word.find(letter) == 0 ):
|
242
|
+
# return True
|
243
|
+
# return False
|
244
|
+
|
245
|
+
# if not PYTHON3:
|
246
|
+
# is_tamil_unicode_predicate = lambda x: x >= unichr(2946) and x <= unichr(3066)
|
247
|
+
# else:
|
248
|
+
# is_tamil_unicode_predicate = lambda x: x >= chr(2946) and x <= chr(3066)
|
249
|
+
# def is_tamil_unicode( sequence ):
|
250
|
+
# # Ref: languagetool-office-extension/src/main/java/org/languagetool/openoffice/TamilDetector.java
|
251
|
+
# if type(sequence) is list:
|
252
|
+
# return list(map( is_tamil_unicode_predicate, sequence ))
|
253
|
+
# if len(sequence) > 1:
|
254
|
+
# return list(map( is_tamil_unicode_predicate, get_letters(sequence) ))
|
255
|
+
# return is_tamil_unicode_predicate( sequence )
|
256
|
+
|
257
|
+
# def all_tamil( word_in ):
|
258
|
+
# """ predicate checks if all letters of the input word are Tamil letters """
|
259
|
+
# if isinstance(word_in,list):
|
260
|
+
# word = word_in
|
261
|
+
# else:
|
262
|
+
# word = get_letters( word_in )
|
263
|
+
# return all( [(letter in tamil_letters) for letter in word] )
|
264
|
+
|
265
|
+
# def has_tamil( word ):
|
266
|
+
# """check if the word has any occurance of any tamil letter """
|
267
|
+
# # list comprehension is not necessary - we bail at earliest
|
268
|
+
# for letters in tamil_letters:
|
269
|
+
# if ( word.find(letters) >= 0 ):
|
270
|
+
# return True
|
271
|
+
# return False
|
272
|
+
|
273
|
+
# def istamil( tchar ):
|
274
|
+
# """ check if the letter tchar is prefix of
|
275
|
+
# any of tamil-letter. It suggests we have a tamil identifier"""
|
276
|
+
# if (tchar in tamil_letters):
|
277
|
+
# return True
|
278
|
+
# return False
|
279
|
+
|
280
|
+
# def istamil_alnum( tchar ):
|
281
|
+
# """ check if the character is alphanumeric, or tamil.
|
282
|
+
# This saves time from running through istamil() check. """
|
283
|
+
# return ( tchar.isalnum( ) or istamil( tchar ) )
|
284
|
+
|
285
|
+
# def reverse_word( word ):
|
286
|
+
# """ reverse a Tamil word according to letters not unicode-points """
|
287
|
+
# op = get_letters( word )
|
288
|
+
# op.reverse()
|
289
|
+
# return "".join(op)
|
290
|
+
|
291
|
+
# ## find out if letters like "பொ" are written as the canonical "ப + ொ" graphemes; if so,
|
292
|
+
# ## return True. If they are written like "ப + ெ + ா" then return False on first occurrence
|
293
|
+
# def is_normalized( text ):
|
294
|
+
# TLEN,idx = len(text),1
|
295
|
+
# kaal = "ா"
|
296
|
+
# sinna_kombu, periya_kombu = "ெ", "ே"
|
297
|
+
# kombugal = [sinna_kombu, periya_kombu]
|
298
|
+
|
299
|
+
# def predicate( last_letter, prev_letter):
|
300
|
+
# if ((last_letter == kaal) and (prev_letter in kombugal)):
|
301
|
+
# return True
|
302
|
+
# return False
|
303
|
+
# if TLEN < 2:
|
304
|
+
# return True
|
305
|
+
# elif TLEN == 2:
|
306
|
+
# if predicate( text[-1], text[-2] ):
|
307
|
+
# return False
|
308
|
+
# return True
|
309
|
+
# a = text[0]
|
310
|
+
# b = text[1]
|
311
|
+
# assert idx == 1
|
312
|
+
# while (idx < TLEN):
|
313
|
+
# if predicate(b,a):
|
314
|
+
# return False
|
315
|
+
# a=b
|
316
|
+
# idx = idx + 1
|
317
|
+
# if idx < TLEN:
|
318
|
+
# b=text[idx]
|
319
|
+
# # reached end and nothing tripped us
|
320
|
+
# return True
|
321
|
+
|
322
|
+
# def _make_set(args):
|
323
|
+
# if PYTHON3:
|
324
|
+
# return frozenset(args)
|
325
|
+
# return set(args)
|
326
|
+
|
327
|
+
# grantha_agaram_set = _make_set(grantha_agaram_letters)
|
328
|
+
# accent_symbol_set = _make_set(accent_symbols)
|
329
|
+
# uyir_letter_set = _make_set(uyir_letters)
|
330
|
+
|
331
|
+
|
332
|
+
# _all_symbols = copy( accent_symbols )
|
333
|
+
# _all_symbols.extend( pulli_symbols )
|
334
|
+
# all_symbol_set = _make_set(_all_symbols)
|
335
|
+
|
336
|
+
# # same as get_letters but use as iterable
|
337
|
+
# def get_letters_iterable( word ):
|
338
|
+
# """ splits the word into a character-list of tamil/english
|
339
|
+
# characters present in the stream """
|
340
|
+
# WLEN,idx = len(word),0
|
341
|
+
|
342
|
+
# while (idx < WLEN):
|
343
|
+
# c = word[idx]
|
344
|
+
# #print(idx,hex(ord(c)),len(ta_letters))
|
345
|
+
# if c in uyir_letter_set or c == ayudha_letter:
|
346
|
+
# idx = idx + 1
|
347
|
+
# yield c
|
348
|
+
# elif c in grantha_agaram_set:
|
349
|
+
# if idx + 1 < WLEN and word[idx+1] in all_symbol_set:
|
350
|
+
# c2 = word[idx+1]
|
351
|
+
# idx = idx + 2
|
352
|
+
# yield (c + c2)
|
353
|
+
# else:
|
354
|
+
# idx = idx + 1
|
355
|
+
# yield c
|
356
|
+
# else:
|
357
|
+
# idx = idx + 1
|
358
|
+
# yield c
|
359
|
+
# raise StopIteration
|
360
|
+
|
361
|
+
# def get_words(letters,tamil_only=False):
|
362
|
+
# return [ word for word in get_words_iterable(letters,tamil_only) ]
|
363
|
+
|
364
|
+
# def get_words_iterable( letters, tamil_only=False ):
|
365
|
+
# """ given a list of UTF-8 letters section them into words, grouping them at spaces """
|
366
|
+
|
367
|
+
# # correct algorithm for get-tamil-words
|
368
|
+
# buf = []
|
369
|
+
# for idx,let in enumerate(letters):
|
370
|
+
# if not let.isspace():
|
371
|
+
# if istamil(let) or (not tamil_only):
|
372
|
+
# buf.append( let )
|
373
|
+
# else:
|
374
|
+
# if len(buf) > 0:
|
375
|
+
# yield "".join( buf )
|
376
|
+
# buf = []
|
377
|
+
# if len(buf) > 0:
|
378
|
+
# yield "".join(buf)
|
379
|
+
|
380
|
+
# def get_tamil_words( letters ):
|
381
|
+
# """ reverse a Tamil word according to letters, not unicode-points """
|
382
|
+
# return [word for word in get_words_iterable( letters, tamil_only = True )]
|
383
|
+
|
384
|
+
# if PYTHON3:
|
385
|
+
# def cmp( x, y):
|
386
|
+
# if x == y:
|
387
|
+
# return 0
|
388
|
+
# if x > y:
|
389
|
+
# return 1
|
390
|
+
# return -1
|
391
|
+
|
392
|
+
# # answer if word_a ranks ahead of, or at same level, as word_b in a Tamil dictionary order...
|
393
|
+
# # for use with Python : if a > 0
|
394
|
+
# def compare_words_lexicographic( word_a, word_b ):
|
395
|
+
# """ compare words in Tamil lexicographic order """
|
396
|
+
# # sanity check for words to be all Tamil
|
397
|
+
# if ( not all_tamil(word_a) ) or (not all_tamil(word_b)) :
|
398
|
+
# print("## ")
|
399
|
+
# print(word_a)
|
400
|
+
# print(word_b)
|
401
|
+
# print("Both operands need to be Tamil words")
|
402
|
+
# La = len(word_a)
|
403
|
+
# Lb = len(word_b)
|
404
|
+
# all_TA_letters = "".join(tamil_letters)
|
405
|
+
# for itr in range(0,min(La,Lb)):
|
406
|
+
# pos1 = all_TA_letters.find( word_a[itr] )
|
407
|
+
# pos2 = all_TA_letters.find( word_b[itr] )
|
408
|
+
|
409
|
+
# if pos1 != pos2 :
|
410
|
+
# #print not( pos1 > pos2), pos1, pos2
|
411
|
+
# return cmp(pos1, pos2)
|
412
|
+
|
413
|
+
# # result depends on if La is shorter than Lb, or 0 if La == Lb i.e. cmp
|
414
|
+
# return cmp(La,Lb)
|
415
|
+
|
416
|
+
# # return a list of ordered-pairs containing positions
|
417
|
+
# # that are common in word_a, and word_b; e.g.
|
418
|
+
# # தேடுக x தடங்கல் -> one common letter க [(2,3)]
|
419
|
+
# # சொல் x தேடுக -> no common letters []
|
420
|
+
# def word_intersection( word_a, word_b ):
|
421
|
+
# """ return a list of tuples where word_a, word_b intersect """
|
422
|
+
# positions = []
|
423
|
+
# word_a_letters = get_letters( word_a )
|
424
|
+
# word_b_letters = get_letters( word_b )
|
425
|
+
# for idx,wa in enumerate(word_a_letters):
|
426
|
+
# for idy,wb in enumerate(word_b_letters):
|
427
|
+
# if ( wa == wb ):
|
428
|
+
# positions.append( (idx, idy) )
|
429
|
+
# return positions
|
430
|
+
|
431
|
+
# def splitMeiUyir(uyirmei_char):
|
432
|
+
# """
|
433
|
+
# This function splits an uyirmei compound character into its mei + uyir characters
|
434
|
+
# and returns in tuple.
|
435
|
+
|
436
|
+
# Input : It must be unicode tamil char.
|
437
|
+
|
438
|
+
# Written By : Arulalan.T
|
439
|
+
# Date : 22.09.2014
|
440
|
+
|
441
|
+
# """
|
442
|
+
|
443
|
+
# if not isinstance(uyirmei_char, PYTHON3 and str or unicode):
|
444
|
+
# raise ValueError("Passed input letter '%s' must be unicode, \
|
445
|
+
# not just string" % uyirmei_char)
|
446
|
+
|
447
|
+
# if uyirmei_char in mei_letters:
|
448
|
+
# return uyirmei_char
|
449
|
+
|
450
|
+
# if uyirmei_char in uyir_letters:
|
451
|
+
# return uyirmei_char
|
452
|
+
|
453
|
+
# if uyirmei_char not in grantha_uyirmei_letters:
|
454
|
+
# raise ValueError("Passed input letter '%s' is not tamil letter" % uyirmei_char)
|
455
|
+
|
456
|
+
# idx = grantha_uyirmei_letters.index(uyirmei_char)
|
457
|
+
# uyiridx = idx % 12
|
458
|
+
# meiidx = int((idx - uyiridx)/ 12)
|
459
|
+
# return (grantha_mei_letters[meiidx], uyir_letters[uyiridx])
|
460
|
+
# # end of def splitMeiUyir(uyirmei_char):
|
461
|
+
|
462
|
+
# def joinMeiUyir(mei_char, uyir_char):
|
463
|
+
# """
|
464
|
+
# This function joins a mei character and an uyir character, and returns them as a
|
465
|
+
# compound uyirmei unicode character.
|
466
|
+
|
467
|
+
# Inputs:
|
468
|
+
# mei_char : It must be unicode tamil mei char.
|
469
|
+
# uyir_char : It must be unicode tamil uyir char.
|
470
|
+
|
471
|
+
# Written By : Arulalan.T
|
472
|
+
# Date : 22.09.2014
|
473
|
+
# """
|
474
|
+
# if not isinstance(mei_char, PYTHON3 and str or unicode):
|
475
|
+
# raise ValueError("Passed input mei character '%s' must be unicode, \
|
476
|
+
# not just string" % mei_char)
|
477
|
+
# if not isinstance(uyir_char, PYTHON3 and str or unicode):
|
478
|
+
# raise ValueError("Passed input uyir character '%s' must be unicode, \
|
479
|
+
# not just string" % uyir_char)
|
480
|
+
# if mei_char not in grantha_mei_letters:
|
481
|
+
# raise ValueError("Passed input character '%s' is not a"
|
482
|
+
# "tamil mei character" % mei_char)
|
483
|
+
# if uyir_char not in uyir_letters:
|
484
|
+
# raise ValueError("Passed input character '%s' is not a"
|
485
|
+
# "tamil uyir character" % uyir_char)
|
486
|
+
# uyiridx = uyir_letters.index(uyir_char)
|
487
|
+
# meiidx = grantha_mei_letters.index(mei_char)
|
488
|
+
# # calculate uyirmei index
|
489
|
+
# uyirmeiidx = meiidx*12 + uyiridx
|
490
|
+
# return grantha_uyirmei_letters[uyirmeiidx]
|
491
|
+
|
492
|
+
# Tamil Letters
|
493
|
+
# அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ ஃ
|
494
|
+
# க் ச் ட் த் ப் ற் ஞ் ங் ண் ந் ம் ன் ய் ர் ல் வ் ழ் ள் ஜ் ஷ் ஸ் ஹ்
|
495
|
+
# க ச ட த ப ற ஞ ங ண ந ம ன ய ர ல வ ழ ள ஜ ஷ ஸ ஹ
|
496
|
+
# க கா கி கீ கு கூ கெ கே கை கொ கோ கௌ
|
497
|
+
# ச சா சி சீ சு சூ செ சே சை சொ சோ சௌ
|
498
|
+
# ட டா டி டீ டு டூ டெ டே டை டொ டோ டௌ
|
499
|
+
# த தா தி தீ து தூ தெ தே தை தொ தோ தௌ
|
500
|
+
# ப பா பி பீ பு பூ பெ பே பை பொ போ பௌ
|
501
|
+
# ற றா றி றீ று றூ றெ றே றை றொ றோ றௌ
|
502
|
+
# ஞ ஞா ஞி ஞீ ஞு ஞூ ஞெ ஞே ஞை ஞொ ஞோ ஞௌ
|
503
|
+
# ங ஙா ஙி ஙீ ஙு ஙூ ஙெ ஙே ஙை ஙொ ஙோ ஙௌ
|
504
|
+
# ண ணா ணி ணீ ணு ணூ ணெ ணே ணை ணொ ணோ ணௌ
|
505
|
+
# ந நா நி நீ நு நூ நெ நே நை நொ நோ நௌ
|
506
|
+
# ம மா மி மீ மு மூ மெ மே மை மொ மோ மௌ
|
507
|
+
# ன னா னி னீ னு னூ னெ னே னை னொ னோ னௌ
|
508
|
+
# ய யா யி யீ யு யூ யெ யே யை யொ யோ யௌ
|
509
|
+
# ர ரா ரி ரீ ரு ரூ ரெ ரே ரை ரொ ரோ ரௌ
|
510
|
+
# ல லா லி லீ லு லூ லெ லே லை லொ லோ லௌ
|
511
|
+
# வ வா வி வீ வு வூ வெ வே வை வொ வோ வௌ
|
512
|
+
# ழ ழா ழி ழீ ழு ழூ ழெ ழே ழை ழொ ழோ ழௌ
|
513
|
+
# ள ளா ளி ளீ ளு ளூ ளெ ளே ளை ளொ ளோ ளௌ
|
514
|
+
# ஶ ஶா ஶி ஶீ ஶு ஶூ ஶெ ஶே ஶை ஶொ ஶோ ஶௌ
|
515
|
+
# ஜ ஜா ஜி ஜீ ஜு ஜூ ஜெ ஜே ஜை ஜொ ஜோ ஜௌ
|
516
|
+
# ஷ ஷா ஷி ஷீ ஷு ஷூ ஷெ ஷே ஷை ஷொ ஷோ ஷௌ
|
517
|
+
# ஸ ஸா ஸி ஸீ ஸு ஸூ ஸெ ஸே ஸை ஸொ ஸோ ஸௌ
|
518
|
+
# ஹ ஹா ஹி ஹீ ஹு ஹூ ஹெ ஹே ஹை ஹொ ஹோ ஹௌ
|
519
|
+
# க்ஷ க்ஷா க்ஷி க்ஷீ க்ஷு க்ஷூ க்ஷெ க்ஷே க்ஷை க்ஷொ க்ஷோ க்ஷௌ
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tamil
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.11'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Muthu Annamalai
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-05-19 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: open-tamil project provides a ruby gem 'tamil' for working with Tamil
|
14
|
+
language text and NLP
|
15
|
+
email: ezhillang@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/tamil.rb
|
21
|
+
homepage: http://ezhillang.org
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.0.14
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: open-tamil project provides a ruby gem 'tamil'
|
45
|
+
test_files: []
|