tamil 0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/tamil.rb +519 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cd7f55b1244ad7f539d0d0211027171f2ea7acdf
4
+ data.tar.gz: 198b5958f12c7e363b3d86ed736ab82a4f496fb4
5
+ SHA512:
6
+ metadata.gz: c1d8bd9c3df20586ffb1f4e56a36e0bb949fe961d365781e770d371c1bd56957b0a208417c457d8f0a20c34278b994790ee6e7e6db570166344bab3b4cd8055b
7
+ data.tar.gz: 9abf039aadfdc6b8a95cc7646ee433ead2ac54d8e252c6df8283e612e7c15431b5bb810a06a4274028e378a2d3527b56531ad6cb5a6aacab47ae5bf0a97554f5
@@ -0,0 +1,519 @@
1
+ # encoding: UTF-8
2
+ # (C) 2015 Muthiah Annamalai <ezhillang@gmail.com>
3
+
4
# Raised by the top-level +assert+ helper when the checked block
# evaluates to a falsy value (+false+ or +nil+).
class AssertionError < RuntimeError
end

# Minimal assertion helper used for the bounds checks in this file.
#
# Yields to the given block and raises AssertionError when the block's
# result is falsy. Returns +nil+ when the assertion holds.
#
# message - optional String attached to the raised AssertionError; when
#           omitted the exception carries its default message (the class
#           name), exactly as before.
#
# Raises AssertionError if the block returns +false+ or +nil+.
# Raises LocalJumpError if called without a block.
def assert(message = nil)
  # `yield` is used directly; capturing the block as a Proc (`&block`)
  # was never needed.
  raise AssertionError, message unless yield
end
10
+
11
# Letter tables and small helpers for working with Tamil text.
#
# The module exposes the sizes of the individual letter classes, indexed
# accessors into each table, and Tamil.get_letters, which splits a string
# into letters (base character plus any trailing signs) instead of raw
# Unicode code points.
#
# The indexed accessors rely on the top-level +assert+ helper defined in
# this file and raise AssertionError for an out-of-range index.
module Tamil
  ## constants
  TA_ACCENT_LEN = 13 # entries in @@accent_symbols (12 + 1)
  TA_AYUDHA_LEN = 1
  TA_UYIR_LEN = 12
  TA_MEI_LEN = 18
  TA_AGARAM_LEN = 18
  TA_SANSKRIT_LEN = 6
  TA_UYIRMEI_LEN = 216
  TA_GRANTHA_UYIRMEI_LEN = 24 * 12
  TA_LETTERS_LEN = 247 + 6 * 12 + 22 + 4 - TA_AGARAM_LEN - 4 # 323

  # List of letters you can use
  @@agaram_letters = ["க", "ச", "ட", "த", "ப", "ற", "ஞ", "ங", "ண", "ந", "ம", "ன", "ய", "ர", "ல", "வ", "ழ", "ள"]
  AGARAM_LETTERS = @@agaram_letters.clone

  @@uyir_letters = ["அ", "ஆ", "இ", "ஈ", "உ", "ஊ", "எ", "ஏ", "ஐ", "ஒ", "ஓ", "ஔ"]
  @@ayudha_letter = "ஃ"

  @@kuril_letters = ["அ", "இ", "உ", "எ", "ஒ"]
  @@nedil_letters = ["ஆ", "ஈ", "ஊ", "ஏ", "ஓ"]

  @@vallinam_letters = ["க்", "ச்", "ட்", "த்", "ப்", "ற்"]
  @@mellinam_letters = ["ங்", "ஞ்", "ண்", "ந்", "ம்", "ன்"]
  @@idayinam_letters = ["ய்", "ர்", "ல்", "வ்", "ழ்", "ள்"]

  @@mei_letters = ["க்", "ச்", "ட்", "த்", "ப்", "ற்", "ஞ்", "ங்", "ண்", "ந்", "ம்", "ன்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்"]

  @@accent_symbols = ["", "ா", "ி", "ீ", "ு", "ூ", "ெ", "ே", "ை", "ொ", "ோ", "ௌ", "ஃ"]
  @@pulli_symbols = ["்"]

  @@sanskrit_letters = ["ஶ", "ஜ", "ஷ", "ஸ", "ஹ", "க்ஷ"]
  @@sanskrit_mei_letters = ["ஶ்", "ஜ்", "ஷ்", "ஸ்", "ஹ்", "க்ஷ்"]

  @@grantha_mei_letters = @@mei_letters.clone
  @@grantha_mei_letters.concat(@@sanskrit_mei_letters)

  @@grantha_agaram_letters = @@agaram_letters.clone
  @@grantha_agaram_letters.concat(@@sanskrit_letters)

  @@uyirmei_letters = [
    "க", "கா", "கி", "கீ", "கு", "கூ", "கெ", "கே", "கை", "கொ", "கோ", "கௌ",
    "ச", "சா", "சி", "சீ", "சு", "சூ", "செ", "சே", "சை", "சொ", "சோ", "சௌ",
    "ட", "டா", "டி", "டீ", "டு", "டூ", "டெ", "டே", "டை", "டொ", "டோ", "டௌ",
    "த", "தா", "தி", "தீ", "து", "தூ", "தெ", "தே", "தை", "தொ", "தோ", "தௌ",
    "ப", "பா", "பி", "பீ", "பு", "பூ", "பெ", "பே", "பை", "பொ", "போ", "பௌ",
    "ற", "றா", "றி", "றீ", "று", "றூ", "றெ", "றே", "றை", "றொ", "றோ", "றௌ",
    "ஞ", "ஞா", "ஞி", "ஞீ", "ஞு", "ஞூ", "ஞெ", "ஞே", "ஞை", "ஞொ", "ஞோ", "ஞௌ",
    "ங", "ஙா", "ஙி", "ஙீ", "ஙு", "ஙூ", "ஙெ", "ஙே", "ஙை", "ஙொ", "ஙோ", "ஙௌ",
    "ண", "ணா", "ணி", "ணீ", "ணு", "ணூ", "ணெ", "ணே", "ணை", "ணொ", "ணோ", "ணௌ",
    "ந", "நா", "நி", "நீ", "நு", "நூ", "நெ", "நே", "நை", "நொ", "நோ", "நௌ",
    "ம", "மா", "மி", "மீ", "மு", "மூ", "மெ", "மே", "மை", "மொ", "மோ", "மௌ",
    "ன", "னா", "னி", "னீ", "னு", "னூ", "னெ", "னே", "னை", "னொ", "னோ", "னௌ",
    "ய", "யா", "யி", "யீ", "யு", "யூ", "யெ", "யே", "யை", "யொ", "யோ", "யௌ",
    "ர", "ரா", "ரி", "ரீ", "ரு", "ரூ", "ரெ", "ரே", "ரை", "ரொ", "ரோ", "ரௌ",
    "ல", "லா", "லி", "லீ", "லு", "லூ", "லெ", "லே", "லை", "லொ", "லோ", "லௌ",
    "வ", "வா", "வி", "வீ", "வு", "வூ", "வெ", "வே", "வை", "வொ", "வோ", "வௌ",
    "ழ", "ழா", "ழி", "ழீ", "ழு", "ழூ", "ழெ", "ழே", "ழை", "ழொ", "ழோ", "ழௌ",
    "ள", "ளா", "ளி", "ளீ", "ளு", "ளூ", "ளெ", "ளே", "ளை", "ளொ", "ளோ", "ளௌ"
  ]

  # Sanskrit (grantha) uyirmei letters: every Sanskrit base letter combined
  # with each of the 12 vowel signs (the first TA_UYIR_LEN accent symbols).
  sanskrit_uyirmei = @@sanskrit_letters.flat_map do |base|
    @@accent_symbols.first(TA_UYIR_LEN).map { |sign| base + sign }
  end

  # Total Tamil letters in use, including Sanskrit letters. Tamil.tamil_len
  # reads this table; it was previously never defined, so that method always
  # raised NameError. Layout follows the reference list kept in the comments
  # at the end of this file: uyir, ayudham, mei, Sanskrit mei, uyirmei,
  # Sanskrit uyirmei (TA_LETTERS_LEN entries in total).
  @@tamil_letters =
    @@uyir_letters +
    [@@ayudha_letter] +
    @@mei_letters +
    @@sanskrit_mei_letters[1, 4] + # the reference list carries only ஜ் ஷ் ஸ் ஹ்
    @@uyirmei_letters +
    sanskrit_uyirmei

  # Splits a Tamil/English string into a list of letters.
  #
  # Vowels, the ayudham and (grantha) consonants each start a new letter.
  # Vowel signs, the pulli and any other character at or above U+00FF are
  # appended to the most recently emitted entry, or emitted on their own
  # when no Tamil letter has been seen yet. Characters below U+00FF
  # (ASCII / Latin-1) are always emitted as separate entries.
  #
  # word - the String to split.
  #
  # Returns an Array of Strings, one per letter; empty for an empty word.
  def self.get_letters(word)
    letters = []
    seen_tamil = false # true once a Tamil letter or sign has been emitted

    # each_char walks the string once; indexing word[idx] in a loop is
    # quadratic on multibyte (UTF-8) strings.
    word.each_char do |ch|
      if @@uyir_letters.include?(ch) || ch == @@ayudha_letter ||
         @@grantha_agaram_letters.include?(ch)
        letters << ch
        seen_tamil = true
      elsif ch < "\u00FF"
        # Latin-range character: stands alone and leaves the flag untouched.
        letters << ch
      elsif seen_tamil
        # Vowel sign, pulli or other high code point: attach to the last
        # entry. NOTE(review): the last entry may be a Latin character if
        # one directly precedes the sign; that matches the previous
        # behaviour and is preserved here.
        letters.last << ch
      else
        # odd situation: a sign with nothing before it to attach to
        letters << ch
        seen_tamil = true
      end
    end

    letters
  end

  ## length of the definitions

  # Returns the number of accent symbols (TA_ACCENT_LEN).
  def self.accent_len
    TA_ACCENT_LEN ## 13 = 12 + 1
  end

  # Returns the number of ayudham letters (TA_AYUDHA_LEN).
  def self.ayudha_len
    TA_AYUDHA_LEN ## 1
  end

  # Returns the number of uyir (vowel) letters (TA_UYIR_LEN).
  def self.uyir_len
    TA_UYIR_LEN ## 12
  end

  # Returns the number of mei (pure consonant) letters (TA_MEI_LEN).
  def self.mei_len
    TA_MEI_LEN ## 18
  end

  # Returns the number of agaram letters (TA_AGARAM_LEN).
  #
  # Raises AssertionError if the agaram table no longer has that many entries.
  def self.agaram_len
    assert { @@agaram_letters.length == TA_AGARAM_LEN }
    TA_AGARAM_LEN ## 18
  end

  # Returns the number of uyirmei letters (TA_UYIRMEI_LEN).
  def self.uyirmei_len
    TA_UYIRMEI_LEN ## 216
  end

  # Returns the number of entries in the complete Tamil letter table.
  def self.tamil_len
    @@tamil_letters.length
  end

  ## access the letters

  # Returns the uyir letter at +idx+ (0-based).
  #
  # Raises AssertionError unless 0 <= idx < Tamil.uyir_len.
  def self.uyir(idx)
    assert { idx >= 0 && idx < uyir_len }
    # Read the table directly; `Tamil::uyir_letters` was a call to a
    # method that does not exist.
    @@uyir_letters[idx]
  end

  # Returns the agaram letter at +idx+ (0-based).
  #
  # Raises AssertionError unless 0 <= idx < Tamil.agaram_len.
  def self.agaram(idx)
    assert { idx >= 0 && idx < agaram_len }
    @@agaram_letters[idx]
  end

  # Returns the mei letter at +idx+ (0-based).
  #
  # Raises AssertionError unless 0 <= idx < Tamil.mei_len.
  def self.mei(idx)
    assert { idx >= 0 && idx < mei_len }
    @@mei_letters[idx]
  end

  # Returns the uyirmei letter at +idx+ (0-based).
  #
  # Raises AssertionError unless 0 <= idx < Tamil.uyirmei_len.
  def self.uyirmei(idx)
    assert { idx >= 0 && idx < uyirmei_len }
    @@uyirmei_letters[idx]
  end
end
164
+
165
+ # ## total tamil letters in use, including sanskrit letters
166
+ # tamil_letters = [
167
+
168
+ # ## /* Uyir */
169
+ # "அ","ஆ","இ", "ஈ","உ","ஊ","எ","ஏ","ஐ","ஒ","ஓ","ஔ",
170
+
171
+ # ##/* Ayuda Ezhuthu */
172
+ # "ஃ",
173
+
174
+ # ## /* Mei */
175
+ # "க்","ச்","ட்","த்","ப்","ற்","ஞ்","ங்","ண்","ந்","ம்","ன்","ய்","ர்","ல்","வ்","ழ்","ள்",
176
+
177
+ # ## /* Agaram */
178
+ # ## "க","ச","ட","த","ப","ற","ஞ","ங","ண","ந","ம","ன","ய","ர","ல","வ","ழ","ள",
179
+
180
+ # ## /* Sanskrit (Vada Mozhi) */
181
+ # ## "ஜ","ஷ", "ஸ","ஹ",
182
+
183
+ # ##/* Sanskrit (Mei) */
184
+ # "ஜ்","ஷ்", "ஸ்","ஹ்",
185
+
186
+ # ## /* Uyir Mei */
187
+ # "க" ,"கா" ,"கி" ,"கீ" ,"கு" ,"கூ" ,"கெ" ,"கே" ,"கை" ,"கொ" ,"கோ" ,"கௌ"
188
+ # ,"ச" ,"சா" ,"சி" ,"சீ" ,"சு" ,"சூ" ,"செ" ,"சே" ,"சை" ,"சொ" ,"சோ" ,"சௌ"
189
+ # ,"ட" ,"டா" ,"டி" ,"டீ" ,"டு" ,"டூ" ,"டெ" ,"டே" ,"டை" ,"டொ" ,"டோ" ,"டௌ"
190
+ # ,"த" ,"தா" ,"தி" ,"தீ" ,"து" ,"தூ" ,"தெ" ,"தே" ,"தை" ,"தொ" ,"தோ" ,"தௌ"
191
+ # ,"ப" ,"பா" ,"பி" ,"பீ" ,"பு" ,"பூ" ,"பெ" ,"பே" ,"பை" ,"பொ" ,"போ" ,"பௌ"
192
+ # ,"ற" ,"றா" ,"றி" ,"றீ" ,"று" ,"றூ" ,"றெ" ,"றே" ,"றை" ,"றொ" ,"றோ" ,"றௌ"
193
+ # ,"ஞ" ,"ஞா" ,"ஞி" ,"ஞீ" ,"ஞு" ,"ஞூ" ,"ஞெ" ,"ஞே" ,"ஞை" ,"ஞொ" ,"ஞோ" ,"ஞௌ"
194
+ # ,"ங" ,"ஙா" ,"ஙி" ,"ஙீ" ,"ஙு" ,"ஙூ" ,"ஙெ" ,"ஙே" ,"ஙை" ,"ஙொ" ,"ஙோ" ,"ஙௌ"
195
+ # ,"ண" ,"ணா" ,"ணி" ,"ணீ" ,"ணு" ,"ணூ" ,"ணெ" ,"ணே" ,"ணை" ,"ணொ" ,"ணோ" ,"ணௌ"
196
+ # ,"ந" ,"நா" ,"நி" ,"நீ" ,"நு" ,"நூ" ,"நெ" ,"நே" ,"நை" ,"நொ" ,"நோ" ,"நௌ"
197
+ # ,"ம" ,"மா" ,"மி" ,"மீ" ,"மு" ,"மூ" ,"மெ" ,"மே" ,"மை" ,"மொ" ,"மோ" ,"மௌ"
198
+ # ,"ன" ,"னா" ,"னி" ,"னீ" ,"னு" ,"னூ" ,"னெ" ,"னே" ,"னை" ,"னொ" ,"னோ" ,"னௌ"
199
+ # ,"ய" ,"யா" ,"யி" ,"யீ" ,"யு" ,"யூ" ,"யெ" ,"யே" ,"யை" ,"யொ" ,"யோ" ,"யௌ"
200
+ # ,"ர" ,"ரா" ,"ரி" ,"ரீ" ,"ரு" ,"ரூ" ,"ரெ" ,"ரே" ,"ரை" ,"ரொ" ,"ரோ" ,"ரௌ"
201
+ # ,"ல" ,"லா" ,"லி" ,"லீ" ,"லு" ,"லூ" ,"லெ" ,"லே" ,"லை" ,"லொ" ,"லோ" ,"லௌ"
202
+ # ,"வ" ,"வா" ,"வி" ,"வீ" ,"வு" ,"வூ" ,"வெ" ,"வே" ,"வை" ,"வொ" ,"வோ" ,"வௌ"
203
+ # ,"ழ" ,"ழா" ,"ழி" ,"ழீ" ,"ழு" ,"ழூ" ,"ழெ" ,"ழே" ,"ழை" ,"ழொ" ,"ழோ" ,"ழௌ"
204
+ # ,"ள" ,"ளா" ,"ளி" ,"ளீ" ,"ளு" ,"ளூ" ,"ளெ" ,"ளே" ,"ளை" ,"ளொ" ,"ளோ" ,"ளௌ"
205
+
206
+ # ##/* Sanskrit Uyir-Mei */
207
+ # ,"ஶ", "ஶா", "ஶி", "ஶீ", "ஶு", "ஶூ", "ஶெ", "ஶே", "ஶை", "ஶொ", "ஶோ", "ஶௌ"
208
+ # ,"ஜ" ,"ஜா" ,"ஜி" ,"ஜீ" ,"ஜு" ,"ஜூ" ,"ஜெ" ,"ஜே" ,"ஜை" ,"ஜொ" ,"ஜோ" ,"ஜௌ"
209
+ # ,"ஷ" ,"ஷா" ,"ஷி" ,"ஷீ" ,"ஷு" ,"ஷூ" ,"ஷெ" ,"ஷே" ,"ஷை" ,"ஷொ" ,"ஷோ" ,"ஷௌ"
210
+ # ,"ஸ" ,"ஸா" ,"ஸி" ,"ஸீ" ,"ஸு" ,"ஸூ" ,"ஸெ" ,"ஸே" ,"ஸை" ,"ஸொ" ,"ஸோ" ,"ஸௌ"
211
+ # ,"ஹ" ,"ஹா" ,"ஹி" ,"ஹீ" ,"ஹு" ,"ஹூ" ,"ஹெ" ,"ஹே" ,"ஹை" ,"ஹொ" ,"ஹோ" ,"ஹௌ"
212
+ # ,"க்ஷ" ,"க்ஷா" ,"க்ஷி" ,"க்ஷீ" ,"க்ஷு" ,"க்ஷூ" ,"க்ஷெ" ,"க்ஷே" ,"க்ஷை" ,"க்ஷொ" ,"க்ஷோ" ,"க்ஷௌ" ]
213
+
214
+ # grantha_uyirmei_letters = tamil_letters[tamil_letters.index("கா")-1:].clone()
215
+
216
+
217
+ # def uyirmei_constructed( mei_idx, uyir_idx):
218
+ # """ construct uyirmei letter given mei index and uyir index """
219
+ # idx,idy = mei_idx,uyir_idx
220
+ # assert ( idy >= 0 and idy < uyir_len() )
221
+ # assert ( idx >= 0 and idx < mei_len() )
222
+ # return agaram_letters[mei_idx]+accent_symbols[uyir_idx]
223
+
224
+ # def tamil( idx ):
225
+ # """ retrieve Tamil letter at canonical index from array utf8.tamil_letters """
226
+ # assert ( idx >= 0 and idx < tamil_len() )
227
+ # return tamil_letters[idx]
228
+
229
+ # # companion function to @tamil()
230
+ # def getidx(letter):
231
+ # for itr in range(0,tamil_len()):
232
+ # if tamil_letters[itr] == letter:
233
+ # return itr
234
+ # raise Exception("Cannot find letter in Tamil arichuvadi")
235
+
236
+ # ## useful part of the API:
237
+ # def istamil_prefix( word ):
238
+ # """ check if the given word has a tamil prefix. Returns
239
+ # either a True/False flag """
240
+ # for letter in tamil_letters:
241
+ # if ( word.find(letter) == 0 ):
242
+ # return True
243
+ # return False
244
+
245
+ # if not PYTHON3:
246
+ # is_tamil_unicode_predicate = lambda x: x >= unichr(2946) and x <= unichr(3066)
247
+ # else:
248
+ # is_tamil_unicode_predicate = lambda x: x >= chr(2946) and x <= chr(3066)
249
+ # def is_tamil_unicode( sequence ):
250
+ # # Ref: languagetool-office-extension/src/main/java/org/languagetool/openoffice/TamilDetector.java
251
+ # if type(sequence) is list:
252
+ # return list(map( is_tamil_unicode_predicate, sequence ))
253
+ # if len(sequence) > 1:
254
+ # return list(map( is_tamil_unicode_predicate, get_letters(sequence) ))
255
+ # return is_tamil_unicode_predicate( sequence )
256
+
257
+ # def all_tamil( word_in ):
258
+ # """ predicate checks if all letters of the input word are Tamil letters """
259
+ # if isinstance(word_in,list):
260
+ # word = word_in
261
+ # else:
262
+ # word = get_letters( word_in )
263
+ # return all( [(letter in tamil_letters) for letter in word] )
264
+
265
+ # def has_tamil( word ):
266
+ # """check if the word has any occurrence of any tamil letter """
267
+ # # list comprehension is not necessary - we bail at earliest
268
+ # for letters in tamil_letters:
269
+ # if ( word.find(letters) >= 0 ):
270
+ # return True
271
+ # return False
272
+
273
+ # def istamil( tchar ):
274
+ # """ check if the letter tchar is prefix of
275
+ # any of tamil-letter. It suggests we have a tamil identifier"""
276
+ # if (tchar in tamil_letters):
277
+ # return True
278
+ # return False
279
+
280
+ # def istamil_alnum( tchar ):
281
+ # """ check if the character is alphanumeric, or tamil.
282
+ # This saves time from running through istamil() check. """
283
+ # return ( tchar.isalnum( ) or istamil( tchar ) )
284
+
285
+ # def reverse_word( word ):
286
+ # """ reverse a Tamil word according to letters not unicode-points """
287
+ # op = get_letters( word )
288
+ # op.reverse()
289
+ # return "".join(op)
290
+
291
+ # ## find out if the letters like, "பொ" are written in canonical "ப + ொ"" graphemes then
292
+ # ## return True. If they are written like "ப + ெ + ா" then return False on first occurrence
293
+ # def is_normalized( text ):
294
+ # TLEN,idx = len(text),1
295
+ # kaal = "ா"
296
+ # sinna_kombu, periya_kombu = "ெ", "ே"
297
+ # kombugal = [sinna_kombu, periya_kombu]
298
+
299
+ # def predicate( last_letter, prev_letter):
300
+ # if ((last_letter == kaal) and (prev_letter in kombugal)):
301
+ # return True
302
+ # return False
303
+ # if TLEN < 2:
304
+ # return True
305
+ # elif TLEN == 2:
306
+ # if predicate( text[-1], text[-2] ):
307
+ # return False
308
+ # return True
309
+ # a = text[0]
310
+ # b = text[1]
311
+ # assert idx == 1
312
+ # while (idx < TLEN):
313
+ # if predicate(b,a):
314
+ # return False
315
+ # a=b
316
+ # idx = idx + 1
317
+ # if idx < TLEN:
318
+ # b=text[idx]
319
+ # # reached end and nothing tripped us
320
+ # return True
321
+
322
+ # def _make_set(args):
323
+ # if PYTHON3:
324
+ # return frozenset(args)
325
+ # return set(args)
326
+
327
+ # grantha_agaram_set = _make_set(grantha_agaram_letters)
328
+ # accent_symbol_set = _make_set(accent_symbols)
329
+ # uyir_letter_set = _make_set(uyir_letters)
330
+
331
+
332
+ # _all_symbols = copy( accent_symbols )
333
+ # _all_symbols.extend( pulli_symbols )
334
+ # all_symbol_set = _make_set(_all_symbols)
335
+
336
+ # # same as get_letters but use as iterable
337
+ # def get_letters_iterable( word ):
338
+ # """ splits the word into a character-list of tamil/english
339
+ # characters present in the stream """
340
+ # WLEN,idx = len(word),0
341
+
342
+ # while (idx < WLEN):
343
+ # c = word[idx]
344
+ # #print(idx,hex(ord(c)),len(ta_letters))
345
+ # if c in uyir_letter_set or c == ayudha_letter:
346
+ # idx = idx + 1
347
+ # yield c
348
+ # elif c in grantha_agaram_set:
349
+ # if idx + 1 < WLEN and word[idx+1] in all_symbol_set:
350
+ # c2 = word[idx+1]
351
+ # idx = idx + 2
352
+ # yield (c + c2)
353
+ # else:
354
+ # idx = idx + 1
355
+ # yield c
356
+ # else:
357
+ # idx = idx + 1
358
+ # yield c
359
+ # raise StopIteration
360
+
361
+ # def get_words(letters,tamil_only=False):
362
+ # return [ word for word in get_words_iterable(letters,tamil_only) ]
363
+
364
+ # def get_words_iterable( letters, tamil_only=False ):
365
+ # """ given a list of UTF-8 letters section them into words, grouping them at spaces """
366
+
367
+ # # correct algorithm for get-tamil-words
368
+ # buf = []
369
+ # for idx,let in enumerate(letters):
370
+ # if not let.isspace():
371
+ # if istamil(let) or (not tamil_only):
372
+ # buf.append( let )
373
+ # else:
374
+ # if len(buf) > 0:
375
+ # yield "".join( buf )
376
+ # buf = []
377
+ # if len(buf) > 0:
378
+ # yield "".join(buf)
379
+
380
+ # def get_tamil_words( letters ):
381
+ # """ return only the Tamil words found in the given list of letters """
382
+ # return [word for word in get_words_iterable( letters, tamil_only = True )]
383
+
384
+ # if PYTHON3:
385
+ # def cmp( x, y):
386
+ # if x == y:
387
+ # return 0
388
+ # if x > y:
389
+ # return 1
390
+ # return -1
391
+
392
+ # # answer if word_a ranks ahead of, or at same level, as word_b in a Tamil dictionary order...
393
+ # # for use with Python : if a > 0
394
+ # def compare_words_lexicographic( word_a, word_b ):
395
+ # """ compare words in Tamil lexicographic order """
396
+ # # sanity check for words to be all Tamil
397
+ # if ( not all_tamil(word_a) ) or (not all_tamil(word_b)) :
398
+ # print("## ")
399
+ # print(word_a)
400
+ # print(word_b)
401
+ # print("Both operands need to be Tamil words")
402
+ # La = len(word_a)
403
+ # Lb = len(word_b)
404
+ # all_TA_letters = "".join(tamil_letters)
405
+ # for itr in range(0,min(La,Lb)):
406
+ # pos1 = all_TA_letters.find( word_a[itr] )
407
+ # pos2 = all_TA_letters.find( word_b[itr] )
408
+
409
+ # if pos1 != pos2 :
410
+ # #print not( pos1 > pos2), pos1, pos2
411
+ # return cmp(pos1, pos2)
412
+
413
+ # # result depends on if La is shorter than Lb, or 0 if La == Lb i.e. cmp
414
+ # return cmp(La,Lb)
415
+
416
+ # # return a list of ordered-pairs containing positions
417
+ # # that are common in word_a, and word_b; e.g.
418
+ # # தேடுக x தடங்கல் -> one common letter க [(2,3)]
419
+ # # சொல் x தேடுக -> no common letters []
420
+ # def word_intersection( word_a, word_b ):
421
+ # """ return a list of tuples where word_a, word_b intersect """
422
+ # positions = []
423
+ # word_a_letters = get_letters( word_a )
424
+ # word_b_letters = get_letters( word_b )
425
+ # for idx,wa in enumerate(word_a_letters):
426
+ # for idy,wb in enumerate(word_b_letters):
427
+ # if ( wa == wb ):
428
+ # positions.append( (idx, idy) )
429
+ # return positions
430
+
431
+ # def splitMeiUyir(uyirmei_char):
432
+ # """
433
+ # This function split uyirmei compound character into mei + uyir characters
434
+ # and returns in tuple.
435
+
436
+ # Input : It must be unicode tamil char.
437
+
438
+ # Written By : Arulalan.T
439
+ # Date : 22.09.2014
440
+
441
+ # """
442
+
443
+ # if not isinstance(uyirmei_char, PYTHON3 and str or unicode):
444
+ # raise ValueError("Passed input letter '%s' must be unicode, \
445
+ # not just string" % uyirmei_char)
446
+
447
+ # if uyirmei_char in mei_letters:
448
+ # return uyirmei_char
449
+
450
+ # if uyirmei_char in uyir_letters:
451
+ # return uyirmei_char
452
+
453
+ # if uyirmei_char not in grantha_uyirmei_letters:
454
+ # raise ValueError("Passed input letter '%s' is not tamil letter" % uyirmei_char)
455
+
456
+ # idx = grantha_uyirmei_letters.index(uyirmei_char)
457
+ # uyiridx = idx % 12
458
+ # meiidx = int((idx - uyiridx)/ 12)
459
+ # return (grantha_mei_letters[meiidx], uyir_letters[uyiridx])
460
+ # # end of def splitMeiUyir(uyirmei_char):
461
+
462
+ # def joinMeiUyir(mei_char, uyir_char):
463
+ # """
464
+ # This function joins a mei character and an uyir character, and returns the
465
+ # compound uyirmei unicode character.
466
+
467
+ # Inputs:
468
+ # mei_char : It must be unicode tamil mei char.
469
+ # uyir_char : It must be unicode tamil uyir char.
470
+
471
+ # Written By : Arulalan.T
472
+ # Date : 22.09.2014
473
+ # """
474
+ # if not isinstance(mei_char, PYTHON3 and str or unicode):
475
+ # raise ValueError("Passed input mei character '%s' must be unicode, \
476
+ # not just string" % mei_char)
477
+ # if not isinstance(uyir_char, PYTHON3 and str or unicode):
478
+ # raise ValueError("Passed input uyir character '%s' must be unicode, \
479
+ # not just string" % uyir_char)
480
+ # if mei_char not in grantha_mei_letters:
481
+ # raise ValueError("Passed input character '%s' is not a"
482
+ # "tamil mei character" % mei_char)
483
+ # if uyir_char not in uyir_letters:
484
+ # raise ValueError("Passed input character '%s' is not a"
485
+ # "tamil uyir character" % uyir_char)
486
+ # uyiridx = uyir_letters.index(uyir_char)
487
+ # meiidx = grantha_mei_letters.index(mei_char)
488
+ # # calculate uyirmei index
489
+ # uyirmeiidx = meiidx*12 + uyiridx
490
+ # return grantha_uyirmei_letters[uyirmeiidx]
491
+
492
+ # Tamil Letters
493
+ # அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ ஃ
494
+ # க் ச் ட் த் ப் ற் ஞ் ங் ண் ந் ம் ன் ய் ர் ல் வ் ழ் ள் ஜ் ஷ் ஸ் ஹ்
495
+ # க ச ட த ப ற ஞ ங ண ந ம ன ய ர ல வ ழ ள ஜ ஷ ஸ ஹ
496
+ # க கா கி கீ கு கூ கெ கே கை கொ கோ கௌ
497
+ # ச சா சி சீ சு சூ செ சே சை சொ சோ சௌ
498
+ # ட டா டி டீ டு டூ டெ டே டை டொ டோ டௌ
499
+ # த தா தி தீ து தூ தெ தே தை தொ தோ தௌ
500
+ # ப பா பி பீ பு பூ பெ பே பை பொ போ பௌ
501
+ # ற றா றி றீ று றூ றெ றே றை றொ றோ றௌ
502
+ # ஞ ஞா ஞி ஞீ ஞு ஞூ ஞெ ஞே ஞை ஞொ ஞோ ஞௌ
503
+ # ங ஙா ஙி ஙீ ஙு ஙூ ஙெ ஙே ஙை ஙொ ஙோ ஙௌ
504
+ # ண ணா ணி ணீ ணு ணூ ணெ ணே ணை ணொ ணோ ணௌ
505
+ # ந நா நி நீ நு நூ நெ நே நை நொ நோ நௌ
506
+ # ம மா மி மீ மு மூ மெ மே மை மொ மோ மௌ
507
+ # ன னா னி னீ னு னூ னெ னே னை னொ னோ னௌ
508
+ # ய யா யி யீ யு யூ யெ யே யை யொ யோ யௌ
509
+ # ர ரா ரி ரீ ரு ரூ ரெ ரே ரை ரொ ரோ ரௌ
510
+ # ல லா லி லீ லு லூ லெ லே லை லொ லோ லௌ
511
+ # வ வா வி வீ வு வூ வெ வே வை வொ வோ வௌ
512
+ # ழ ழா ழி ழீ ழு ழூ ழெ ழே ழை ழொ ழோ ழௌ
513
+ # ள ளா ளி ளீ ளு ளூ ளெ ளே ளை ளொ ளோ ளௌ
514
+ # ஶ ஶா ஶி ஶீ ஶு ஶூ ஶெ ஶே ஶை ஶொ ஶோ ஶௌ
515
+ # ஜ ஜா ஜி ஜீ ஜு ஜூ ஜெ ஜே ஜை ஜொ ஜோ ஜௌ
516
+ # ஷ ஷா ஷி ஷீ ஷு ஷூ ஷெ ஷே ஷை ஷொ ஷோ ஷௌ
517
+ # ஸ ஸா ஸி ஸீ ஸு ஸூ ஸெ ஸே ஸை ஸொ ஸோ ஸௌ
518
+ # ஹ ஹா ஹி ஹீ ஹு ஹூ ஹெ ஹே ஹை ஹொ ஹோ ஹௌ
519
+ # க்ஷ க்ஷா க்ஷி க்ஷீ க்ஷு க்ஷூ க்ஷெ க்ஷே க்ஷை க்ஷொ க்ஷோ க்ஷௌ
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tamil
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.11'
5
+ platform: ruby
6
+ authors:
7
+ - Muthu Annamalai
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-05-19 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: open-tamil project provides a ruby gem 'tamil' for working with Tamil
14
+ language text and NLP
15
+ email: ezhillang@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/tamil.rb
21
+ homepage: http://ezhillang.org
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.0.14
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: open-tamil project provides a ruby gem 'tamil'
45
+ test_files: []