mosta-raramorph 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +56 -0
- data/bin/raramorph +6 -0
- data/lib/dictionaries/dictPrefixes +421 -0
- data/lib/dictionaries/dictStems +135989 -0
- data/lib/dictionaries/dictSuffixes +1170 -0
- data/lib/dictionaries/marshal_stems +0 -0
- data/lib/dictionaries/tableAB +2276 -0
- data/lib/dictionaries/tableAC +743 -0
- data/lib/dictionaries/tableBC +1584 -0
- data/lib/raramorph/arabic_latin_translator.rb +38 -0
- data/lib/raramorph/dictionary_entry.rb +40 -0
- data/lib/raramorph/in_memory_dictionary_handler.rb +325 -0
- data/lib/raramorph/in_memory_solutions_handler.rb +78 -0
- data/lib/raramorph/latin_arabic_translator.rb +35 -0
- data/lib/raramorph/logger.rb +20 -0
- data/lib/raramorph/raramorph.rb +417 -0
- data/lib/raramorph/solution.rb +592 -0
- data/lib/raramorph/translator.rb +40 -0
- data/lib/raramorph.rb +16 -0
- data/lib/raramorph_main.rb +34 -0
- data/lib/test_input/UTF-8.txt +32 -0
- data/raramorph.gemspec +42 -0
- metadata +75 -0
@@ -0,0 +1,592 @@
|
|
1
|
+
# A class to find the solution of the word
|
2
|
+
# Author:: eSpace technologies www.eSpace.com.eg
|
3
|
+
# Copyright:: 2008
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
class Solution

  attr_reader :prefix, :stem, :suffix, :cnt

  # POS tags which, when they end the first element of a stem's POS list, mark
  # that element as really being a prefix.
  @@ends_with_set_for_pos_one = Set.new(%w[
    CONJ EMPHATIC_PARTICLE FUNC_WORD FUT_PART INTERJ INTERROG_PART
    IV1S IV2MS IV2FS IV3MS IV3FS IV2D IV2FD IV3MD IV3FD
    IV1P IV2MP IV2FP IV3MP IV3FP NEG_PART PREP RESULT_CLAUSE_PARTICLE
  ])

  # POS tags which, when they end the last element of a stem's POS list, mark
  # that element as really being a suffix.
  @@ends_with_set_for_pos_two = Set.new(%w[
    CASE_INDEF_NOM CASE_INDEF_ACC CASE_INDEF_ACCGEN CASE_INDEF_GEN
    CASE_DEF_NOM CASE_DEF_ACC CASE_DEF_ACCGEN CASE_DEF_GEN
    NSUFF_MASC_SG_ACC_INDEF NSUFF_FEM_SG NSUFF_MASC_DU_NOM
    NSUFF_MASC_DU_NOM_POSS NSUFF_MASC_DU_ACCGEN NSUFF_MASC_DU_ACCGEN_POSS
    NSUFF_FEM_DU_NOM NSUFF_FEM_DU_NOM_POSS NSUFF_FEM_DU_ACCGEN
    NSUFF_FEM_DU_ACCGEN_POSS NSUFF_MASC_PL_NOM NSUFF_MASC_PL_NOM_POSS
    NSUFF_MASC_PL_ACCGEN NSUFF_MASC_PL_ACCGEN_POSS NSUFF_FEM_PL
    POSS_PRON_1S POSS_PRON_2MS POSS_PRON_2FS POSS_PRON_3MS POSS_PRON_3FS
    POSS_PRON_2D POSS_PRON_3D POSS_PRON_1P
    POSS_PRON_2MP POSS_PRON_2FP POSS_PRON_3MP POSS_PRON_3FP
    IVSUFF_DO:1S IVSUFF_DO:2MS IVSUFF_DO:2FS IVSUFF_DO:3MS
    IVSUFF_DO:3FS IVSUFF_DO:2D IVSUFF_DO:3D IVSUFF_DO:1P
    IVSUFF_DO:2MP IVSUFF_DO:2FP IVSUFF_DO:3MP IVSUFF_DO:3FP
    IVSUFF_MOOD:I IVSUFF_SUBJ:2FS_MOOD:I IVSUFF_SUBJ:D_MOOD:I
    IVSUFF_SUBJ:3D_MOOD:I IVSUFF_SUBJ:MP_MOOD:I IVSUFF_MOOD:S
    IVSUFF_SUBJ:2FS_MOOD:SJ IVSUFF_SUBJ:D_MOOD:SJ IVSUFF_SUBJ:MP_MOOD:SJ
    IVSUFF_SUBJ:3MP_MOOD:SJ IVSUFF_SUBJ:FP
    PVSUFF_DO:1S PVSUFF_DO:2MS PVSUFF_DO:2FS PVSUFF_DO:3MS
    PVSUFF_DO:3FS PVSUFF_DO:2D PVSUFF_DO:3D PVSUFF_DO:1P
    PVSUFF_DO:2MP PVSUFF_DO:2FP PVSUFF_DO:3MP PVSUFF_DO:3FP
    PVSUFF_SUBJ:1S PVSUFF_SUBJ:2MS PVSUFF_SUBJ:2FS PVSUFF_SUBJ:3MS
    PVSUFF_SUBJ:3FS PVSUFF_SUBJ:2MD PVSUFF_SUBJ:2FD PVSUFF_SUBJ:3MD
    PVSUFF_SUBJ:3FD PVSUFF_SUBJ:1P
    PVSUFF_SUBJ:2MP PVSUFF_SUBJ:2FP PVSUFF_SUBJ:3MP PVSUFF_SUBJ:3FP
    CVSUFF_DO:1S CVSUFF_DO:3MS CVSUFF_DO:3FS CVSUFF_DO:3D
    CVSUFF_DO:1P CVSUFF_DO:3MP CVSUFF_DO:3FP
    CVSUFF_SUBJ:2MS CVSUFF_SUBJ:2FS CVSUFF_SUBJ:2MP
  ])

  protected

  # Constructs a solution for a word. Note that the prefix, stem and suffix
  # combination is <b>recomputed</b> and may not necessarily match the
  # information provided by the dictionaries.
  # * [debug] Whether or not the dictionaries' inconsistencies should be output
  # * [cnt] Order in sequence
  # * [prefix] The prefix as provided by the prefixes dictionary
  # * [stem] The stem as provided by the stems dictionary
  # * [suffix] The suffix as provided by the suffixes dictionary
  #
  # Each entry must respond to +pos+ and +glosses+ (arrays of strings) and to
  # +morphology+; the stem entry must also respond to +lemma_id+.
  def initialize(debug, cnt, prefix, stem, suffix)
    @debug = debug
    @cnt = cnt
    @prefix = prefix
    @stem = stem
    @suffix = suffix

    # Work on copies: the normalization below moves elements between these
    # lists, and must not alter the arrays handed out by the entries in case
    # those are the entries' own internal arrays.
    @prefixesPOS = prefix.pos.dup
    @stemsPOS = stem.pos.dup
    @suffixesPOS = suffix.pos.dup
    @prefixesGlosses = prefix.glosses.dup
    @stemsGlosses = stem.glosses.dup
    @suffixesGlosses = suffix.glosses.dup

    if @debug && @stemsPOS.length != @stemsGlosses.length
      puts "\"#{get_lemma}\" : stem's sizes for POS (\"#{@stemsPOS.length}\") and GLOSS (\"#{@stemsGlosses.length}\") do not match"
    end

    move_leading_prefix_tags   # stems can contain prefixes
    move_trailing_suffix_tags  # stems can contain suffixes
    merge_bayon

    # Sanity check
    puts "More than one stem for #{@stemsPOS}" if @debug && @stemsPOS.length > 1
  end

  # Returns the lemma id in the stems dictionary, i.e. the stem's lemma_id
  # with everything from the first "_" or "-" onwards removed.
  # * @return The lemma ID
  #
  def get_lemma
    @stem.lemma_id.sub(/(_|-).*$/, "")
  end

  # Returns the vocalizations of the <b>recomputed</b> prefixes in the Buckwalter transliteration system
  # or <b>nil</b> if there are no prefixes for the word.
  # * @return The vocalizations
  #
  def get_prefixes_vocalizations
    vocalizations(false, @prefixesPOS, false)
  end

  # Returns the vocalizations of the <b>recomputed</b> prefixes in arabic
  # or <b>nil</b> if there are no prefixes for the word.
  # * @return The vocalizations
  #
  def get_prefixes_arabic_vocalizations
    vocalizations(true, @prefixesPOS, false)
  end

  # Returns the vocalization of the <b>recomputed</b> stem in the Buckwalter transliteration system
  # or <b>nil</b> if there is no stem for the word.
  # * @return The vocalization
  #
  def get_stem_vocalization
    vocalizations(false, @stemsPOS, true)
  end

  # Returns the vocalization of the <b>recomputed</b> stem in arabic
  # or <b>nil</b> if there is no stem for the word.
  # * @return The vocalization
  #
  def get_stem_arabic_vocalization
    vocalizations(true, @stemsPOS, true)
  end

  # Returns the vocalizations of the <b>recomputed</b> suffixes in the Buckwalter transliteration system
  # or <b>nil</b> if there are no suffixes for the word.
  # * @return The vocalizations
  #
  def get_suffixes_vocalizations
    vocalizations(false, @suffixesPOS, false)
  end

  # Returns the vocalizations of the <b>recomputed</b> suffixes in arabic
  # or <b>nil</b> if there are no suffixes for the word.
  # * @return The vocalizations
  #
  def get_suffixes_arabic_vocalizations
    vocalizations(true, @suffixesPOS, false)
  end

  # Returns the vocalization of the word in the Buckwalter transliteration system.
  # * @return The vocalization
  #
  def get_word_vocalization
    assemble_word(get_prefixes_vocalizations, get_stem_vocalization, get_suffixes_vocalizations)
  end

  # Returns the vocalization of the word in arabic.
  # * @return The vocalization
  #
  def get_word_arabic_vocalization
    assemble_word(get_prefixes_arabic_vocalizations,
                  get_stem_arabic_vocalization,
                  get_suffixes_arabic_vocalizations)
  end

  # Returns the morphology of the prefix.
  # * @return The morphology
  #
  def get_prefix_morphology
    @prefix.morphology
  end

  # Returns the morphology of the stem.
  # * @return The morphology
  #
  def get_stem_morphology
    @stem.morphology
  end

  # Returns the morphology of the suffix.
  # * @return The morphology
  #
  def get_suffix_morphology
    @suffix.morphology
  end

  # Returns the morphology of the word: one "\t<part> : <morphology>\n" line
  # for each of prefix, stem and suffix whose morphology is neither nil nor empty.
  # * @return The morphology
  #
  def get_word_morphology
    text = "".force_encoding("UTF-8")
    [["prefix", @prefix], ["stem", @stem], ["suffix", @suffix]].each do |label, entry|
      morphology = entry.morphology
      # nil must be tested before empty?, otherwise a nil morphology raises.
      next if morphology.nil? || morphology.empty?
      text << "\t#{label} : #{morphology}\n"
    end
    text
  end

  # Returns the grammatical categories of the <b>recomputed</b> prefixes
  # or <b>nil</b> if there are no prefixes for the word.
  # * @return The grammatical categories
  #
  def get_prefixes_POS
    perform_on_POS(1, @prefixesPOS, 1)
  end

  # Returns the vocalizations using the Buckwalter transliteration system of the <b>recomputed</b> prefixes and their grammatical categories
  # or <b>nil</b> if there are no prefixes for the word.
  # * @return The vocalizations and the grammatical categories
  #
  def get_prefixes_long_POS
    perform_on_POS(2, @prefixesPOS, 1)
  end

  # Returns the vocalizations in arabic of the <b>recomputed</b> prefixes and their grammatical categories
  # or <b>nil</b> if there are no prefixes for the word.
  # * @return The vocalizations and the grammatical categories.
  #
  def get_prefixes_arabic_long_POS
    perform_on_POS(3, @prefixesPOS, 1)
  end

  # Returns the grammatical category of the <b>recomputed</b> stem
  # or <b>nil</b> if there is no stem for the word.
  # * @return The grammatical category
  #
  def get_stem_POS
    perform_on_POS(1, @stemsPOS, 2)
  end

  # Returns the vocalization using the Buckwalter transliteration system of the <b>recomputed</b> stem and its grammatical category
  # or <b>nil</b> if there is no stem for the word.
  # * @return The vocalization and the grammatical category.
  #
  def get_stem_long_POS
    perform_on_POS(2, @stemsPOS, 2)
  end

  # Returns the vocalization in arabic of the <b>recomputed</b> stem and its grammatical category
  # or <b>nil</b> if there is no stem for the word.
  # * @return The vocalization and the grammatical category.
  #
  def get_stem_arabic_long_POS
    perform_on_POS(3, @stemsPOS, 2)
  end

  # Returns the grammatical categories of the <b>recomputed</b> suffixes
  # or <b>nil</b> if there are no suffixes for the word.
  # * @return The grammatical categories
  #
  def get_suffixes_POS
    perform_on_POS(1, @suffixesPOS, 3)
  end

  # Returns the vocalizations using the Buckwalter transliteration system of the <b>recomputed</b> suffixes and their grammatical categories
  # or <b>nil</b> if there are no suffixes for the word.
  # * @return The vocalizations and the grammatical categories.
  #
  def get_suffixes_long_POS
    perform_on_POS(2, @suffixesPOS, 3)
  end

  # Returns the vocalizations in arabic of the <b>recomputed</b> suffixes and their grammatical categories
  # or <b>nil</b> if there are no suffixes for the word.
  # * @return The vocalizations and the grammatical categories.
  #
  def get_suffixes_arabic_long_POS
    perform_on_POS(3, @suffixesPOS, 3)
  end

  # Returns the vocalization of the word in the Buckwalter transliteration system and its grammatical categories.
  # * @return The vocalization and the grammatical categories
  #
  def get_word_long_POS
    word_POS(false)
  end

  # Returns the vocalization of the word in arabic and its grammatical categories.
  # * @return The vocalization and the grammatical categories
  #
  def get_word_arabic_long_POS
    word_POS(true)
  end

  # Returns the english glosses of the prefixes, or <b>nil</b> if there are none.
  # * @return The glosses.
  #
  def get_prefixes_glosses
    @prefixesGlosses.empty? ? nil : @prefixesGlosses
  end

  # Returns the english gloss of the stem, or <b>nil</b> if there is none.
  # When several glosses remain, the first one is returned anyway.
  # * @return The gloss.
  #
  def get_stem_gloss
    return nil if @stemsGlosses.empty?
    puts "More than one gloss for #{@stemsGlosses}" if @debug && @stemsGlosses.length > 1
    @stemsGlosses[0]
  end

  # Returns the english glosses of the suffixes, or <b>nil</b> if there are none.
  # * @return The glosses.
  #
  def get_suffixes_glosses
    @suffixesGlosses.empty? ? nil : @suffixesGlosses
  end

  # Returns the english glosses of the word: one line each for the first
  # prefix gloss, the stem gloss and the first suffix gloss (when present),
  # with ";" replaced by "/".
  # * @return The glosses.
  #
  def get_word_glosses
    prefix_gloss = (get_prefixes_glosses || [])[0]
    stem_gloss = get_stem_gloss # looked up once so its debug message is not repeated
    suffix_gloss = (get_suffixes_glosses || [])[0]

    text = "".force_encoding("UTF-8")
    text << "\tprefix : #{prefix_gloss.gsub(";", "/")}\n" unless prefix_gloss.nil?
    text << "\tstem : #{stem_gloss.gsub(";", "/")}\n" unless stem_gloss.nil?
    text << "\tsuffix : #{suffix_gloss.gsub(";", "/")}\n" unless suffix_gloss.nil?
    text
  end

  public

  # Returns a string representation of how the word can be analyzed using the Buckwalter transliteration system for the vocalizations.
  # * @return The representation
  #
  def to_s
    describe(get_word_vocalization, get_word_long_POS)
  end

  # Returns a string representation of how the word can be analyzed using arabic for the vocalizations.
  # * @return The representation
  #
  def to_arabized_string
    describe(get_word_arabic_vocalization, get_word_arabic_long_POS)
  end

  private

  # Moves leading stem POS entries (and their glosses) that end with a
  # prefix-type tag onto the end of the prefix lists.
  def move_leading_prefix_tags
    until @stemsPOS.empty?
      tag = @stemsPOS.first.force_encoding("UTF-8")
      gloss = @stemsGlosses.first
      gloss.force_encoding("UTF-8") if gloss
      break unless tag.ends_with_suffix_set?(@@ends_with_set_for_pos_one)

      @prefixesPOS.push(@stemsPOS.shift)
      @prefixesGlosses.push(@stemsGlosses.shift) if gloss
    end
  end

  # Moves trailing stem POS entries (and their glosses) that end with a
  # suffix-type tag onto the front of the suffix lists.
  def move_trailing_suffix_tags
    until @stemsPOS.empty?
      tag = @stemsPOS.last.force_encoding("UTF-8")
      gloss = @stemsGlosses.last
      gloss.force_encoding("UTF-8") if gloss
      break unless tag.ends_with_suffix_set?(@@ends_with_set_for_pos_two)

      @suffixesPOS.unshift(@stemsPOS.pop)
      @suffixesGlosses.unshift(@stemsGlosses.pop) if gloss
    end
  end

  # Normalization of bayon, bayona, bayoni: when the second stem POS entry is
  # one of these, it is appended to the vocalization part of the first entry
  # and the two entries are replaced by the merged one.
  def merge_bayon
    return unless @stemsPOS.length > 1

    pos0 = @stemsPOS[0]
    pos1 = @stemsPOS[1]
    return unless %w[bayon bayona bayoni].include?(pos1)

    puts "Merging \"#{pos1}\" into first part of stem \"#{pos0}\"" if @debug
    head, *tags = pos0.split("/")
    @stemsPOS.shift
    # The remaining parts are concatenated without a separator, which is what
    # the previous loop was written to do before it failed to advance its index.
    @stemsPOS[0] = "#{head}#{pos1}/#{tags.join}"
  end

  # Concatenates the first prefix vocalization, the stem vocalization and the
  # first suffix vocalization, skipping whichever is nil.
  # NOTE(review): only element [0] of the prefix/suffix lists is used, so any
  # further recomputed prefixes/suffixes are left out of the word -- confirm
  # whether all of them should be concatenated.
  def assemble_word(prefixes, stem, suffixes)
    word = "".force_encoding("UTF-8")
    word << prefixes[0].to_s unless prefixes.nil?
    word << stem unless stem.nil?
    word << suffixes[0].to_s unless suffixes.nil?
    word
  end

  # Builds the report shared by to_s and to_arabized_string.
  # * [vocalization] The word vocalization to display
  # * [long_pos] The "grammatical category" section to display
  def describe(vocalization, long_pos)
    "\n SOLUTION # #{@cnt} \n Lemma : #{get_lemma} \n\n" \
    "Vocalized as : \t #{vocalization} \n\n" \
    "Morphology : \n #{get_word_morphology}\n" \
    "Grammatical category : \n\n" \
    "#{long_pos} Glossed as : \n\n" \
    "#{get_word_glosses} "
  end

  # Returns the vocalizations (the part before the first "/") of the given POS entries
  # * [arabic] Whether or not the vocalization is translated with LatinArabicTranslator
  # * [arr] The array utilized, either of prefixes, stems, suffixes
  # * [one] Whether a single vocalization is returned (true for stems) or the whole array (prefixes and suffixes)
  # * @return nil when arr is empty, otherwise a string (one) or an array of strings
  #
  def vocalizations(arabic, arr, one)
    return nil if arr.empty?

    found = arr.map do |pos|
      head = pos.split("/")[0]
      arabic ? LatinArabicTranslator.translate(head).force_encoding("UTF-8") : head
    end
    return found unless one

    puts "More than one stem for #{found}" if @debug && found.length > 1
    found[0]
  end

  # Renders the given POS entries according to the requested type
  # * [type] 1 for the categories only, 2 to lead with the Buckwalter vocalization, anything else to lead with the arabic vocalization
  # * [arr] The array utilized, either of prefixes, stems, suffixes
  # * [pre_stem_suff] Which kind of array is handled (1 for prefixes, 2 for stems, 3 for suffixes); for stems only the first rendering is returned
  # * @return nil when arr is empty, otherwise a string (stems) or an array of strings
  #
  def perform_on_POS(type, arr, pre_stem_suff)
    return nil if arr.empty?

    rendered = arr.map do |pos|
      # Destructuring keeps an entry without any "/" part from raising.
      head, *tags = pos.split("/")
      lead = if type == 1
               ""
             elsif type == 2
               "#{head}\t"
             else
               "#{LatinArabicTranslator.translate(head)}\t".force_encoding("UTF-8")
             end
      lead + tags.join(" / ")
    end
    return rendered unless pre_stem_suff == 2

    puts "More than one stem for #{rendered}" if @debug && rendered.length > 1
    puts "Empty POS for stem #{get_stem_long_POS}" if type == 1 && rendered[0].empty?
    # return the first anyway :-(
    rendered[0]
  end

  # Returns the vocalizations and the grammatical categories of the word, one
  # line each for the first prefix, the stem and the first suffix (when present)
  # * [arabic] Boolean to choose, Buckwalter transliteration system or arabic
  #
  def word_POS(arabic)
    prefixes = arabic ? get_prefixes_arabic_long_POS : get_prefixes_long_POS
    stem = arabic ? get_stem_arabic_long_POS : get_stem_long_POS
    suffixes = arabic ? get_suffixes_arabic_long_POS : get_suffixes_long_POS

    text = ""
    text << "\tprefix : #{prefixes[0]}\n" unless prefixes.nil? || prefixes[0].nil?
    text << "\tstem : #{stem} \n" unless stem.nil?
    text << "\tsuffix : #{suffixes[0]}\n" unless suffixes.nil? || suffixes[0].nil?
    text
  end

end
|
582
|
+
|
583
|
+
class String
  # Tells whether any non-empty trailing part of this string (the whole string
  # included) is a member of the given collection.
  # * [ends_with_suffix_set] Collection responding to member?, e.g. a Set of strings
  # * @return true or false
  #
  def ends_with_suffix_set?(ends_with_suffix_set)
    (0...length).any? do |start|
      ends_with_suffix_set.member?(self[start..-1])
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Class For Translation
|
2
|
+
# Author:: eSpace technologies www.eSpace.com.eg
|
3
|
+
# Copyright:: 2008
|
4
|
+
|
5
|
+
class Translator
|
6
|
+
|
7
|
+
TABLE = { "ہ"=>"A" , "ء"=>"A","آ"=>"A" ,"أ"=>"A","ؤ"=>"A", "إ"=>"A",
|
8
|
+
"ا"=>"C" ,
|
9
|
+
"ب"=>"E", "ة"=>"E" , "ت"=>"E" , "ث"=>"E",
|
10
|
+
"ج"=>"I" , "ح"=>"I" , "خ"=>"I" , "د"=>"I",
|
11
|
+
"ر"=>"N" ,
|
12
|
+
"ز"=>"O" , "س"=>"O" , "ش"=>"O" , "ص"=>"O" , "ض"=>"O" ,
|
13
|
+
"ظ"=>"U" , "ع"=>"U" , "غ"=>"U" , "ـ"=>"U" ,
|
14
|
+
"à"=>"a" , "ل"=>"a" , "â"=>"a" , "م"=>"a" , "ن"=>"a" , "ه"=>"a" ,
|
15
|
+
"ç"=>"c" ,
|
16
|
+
"è"=>"e" , "é"=>"e" , "ê"=>"e" , "ë"=>"e" ,
|
17
|
+
"ى"=>"i" , "ي"=>"i" , "î"=>"i" , "ï"=>"i" ,
|
18
|
+
"ٌ"=>"n" ,
|
19
|
+
"ٍ"=>"o" , "َ"=>"o" , "ô"=>"o" , "ُ"=>"o" , "ِ"=>"o" ,
|
20
|
+
"ù"=>"u" , "ْ"=>"u" , "û"=>"u" , "ü"=>"u" ,
|
21
|
+
"ئ"=>"AE" , "ٹ"=>"Sh" , "ژ"=>"Zh" , "ك"=>"ss" , "و"=>"ae" , "ڑ"=>"sh" , ""=>"zh" }
|
22
|
+
|
23
|
+
# * Translate The String
# Walks the string two length units at a time and replaces every unit pair
# that is a key of TABLE with its mapped value; other pairs are copied as is.
# A string of odd length is returned untouched.
# NOTE(review): stepping by two looks like it assumes each character occupies
# two length units; where String#length counts characters the single-character
# keys would not be matched -- confirm against the targeted Ruby version.
def translate(string)
return string if string.length.odd?
translated = ""
0.step(string.length - 2, 2) do |offset|
pair = string[offset, 2]
translated += (TABLE[pair] || pair)
end
translated
end
|
36
|
+
|
37
|
+
# Looks up a single key in TABLE.
# * [str] The key to look up
# * @return The mapped value, or nil when TABLE has no such key
def table(str)
TABLE.fetch(str, nil)
end
|
40
|
+
end
|
data/lib/raramorph.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#Dir[File.join(File.dirname(__FILE__), 'raramorph/**/*.rb')].sort.each { |lib| require lib }
|
2
|
+
|
3
|
+
# Put this file's directory on the load path so the 'raramorph/...' requires resolve.
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
load_started_at = Time.now
# Standard library first, then the library files, in the original order.
%w[
  set
  stringio
  raramorph/logger
  raramorph/translator
  raramorph/arabic_latin_translator
  raramorph/latin_arabic_translator
  raramorph/in_memory_dictionary_handler
  raramorph/in_memory_solutions_handler
  raramorph/solution
  raramorph/dictionary_entry
  raramorph/raramorph
].each { |feature| require feature }
# Reports how long the requires above took.
puts "Time Elapsed loading dictionaries= #{Time.now - load_started_at}"
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# ARGV[0] # Input File Name
|
2
|
+
# ARGV[1] # Output File Name
|
3
|
+
# ARGV[2] # Verbose Default False
|
4
|
+
# ARGV[3] # "-a" selects Arabic output (also accepted as ARGV[2]); default is Buckwalter output
|
5
|
+
# Command line entry point: analyzes ARGV[0] into ARGV[1], or prints the usage
# when the number of arguments is not between 2 and 4.
$:.unshift File.expand_path(File.dirname(__FILE__) )
if ARGV.length.between?(2, 4)
  require 'raramorph'
  start = Time.now
  # The optional switches may come in either order after the two file names.
  flags = ARGV[2..3] || []
  verbose = flags.include?("-v")
  not_arabic = !flags.include?("-a")
  Raramorph.execute(ARGV[0] , ARGV[1] , verbose , not_arabic )
  puts "Time Elapsed= " + ( Time.now - start).to_s
else
  puts("Arabic Morphological Analyzer for Ruby")
  puts("Ported to Ruby by Moustafa Emara and Hany Salah El din , eSpace-technologies.(www.espace.com.eg) , 2008.")
  puts("Based on :")
  puts("BUCKWALTER ARABIC MORPHOLOGICAL ANALYZER")
  puts("This program is developed under the MIT-Licences")
  puts("Usage :")
  puts("")
  # Both file names are required and no encoding argument is read, so the
  # usage line lists exactly what the branch above accepts.
  puts("raraMorph inFile outFile [-v] [-a]")
  puts("")
  puts("inFile : file to be analyzed")
  puts("outFile : result file ")
  puts("-v : verbose mode")
  puts("-a : Arabic Output" )
end
|
32
|
+
|
33
|
+
|
34
|
+
|