mosta-raramorph 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +56 -0
- data/bin/raramorph +6 -0
- data/lib/dictionaries/dictPrefixes +421 -0
- data/lib/dictionaries/dictStems +135989 -0
- data/lib/dictionaries/dictSuffixes +1170 -0
- data/lib/dictionaries/marshal_stems +0 -0
- data/lib/dictionaries/tableAB +2276 -0
- data/lib/dictionaries/tableAC +743 -0
- data/lib/dictionaries/tableBC +1584 -0
- data/lib/raramorph/arabic_latin_translator.rb +38 -0
- data/lib/raramorph/dictionary_entry.rb +40 -0
- data/lib/raramorph/in_memory_dictionary_handler.rb +325 -0
- data/lib/raramorph/in_memory_solutions_handler.rb +78 -0
- data/lib/raramorph/latin_arabic_translator.rb +35 -0
- data/lib/raramorph/logger.rb +20 -0
- data/lib/raramorph/raramorph.rb +417 -0
- data/lib/raramorph/solution.rb +592 -0
- data/lib/raramorph/translator.rb +40 -0
- data/lib/raramorph.rb +16 -0
- data/lib/raramorph_main.rb +34 -0
- data/lib/test_input/UTF-8.txt +32 -0
- data/raramorph.gemspec +42 -0
- metadata +75 -0
@@ -0,0 +1,417 @@
|
|
1
|
+
# A Ruby port of Buckwalter Morphological Analyzer Version 1.0.
|
2
|
+
# Author:: eSpace technologies www.eSpace.com.eg
|
3
|
+
# Copyright:: 2008
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
class Raramorph
|
8
|
+
|
9
|
+
# Dictionary handler, created once when the class body is evaluated.
@@dict = InMemoryDictionaryHandler.create
# Solutions handler, created once when the class body is evaluated.
@@sol = InMemorySolutionsHandler.create
# Regular expressions for the word endings that have alternative spellings.
# Other code addresses the entries by position, so the order must not change.
@@alternative_spellings = ["Y'", "y'", "y", "h", "p"].map do |ending|
  Regexp.compile(".*" + ending + "$")
end
# Matches one or more whitespace characters.
@@space_regex = Regexp.compile("\\s+")
|
21
|
+
|
22
|
+
# * Analyze and process a file ( i.e. run the morphological analysis on every line )
# * [file_reader_in] Input file path
# * [output_buckwalter] forwarded unchanged to analyze_token for every token
#
# Increments @lines_counter once per line and, when the logger is verbose,
# prints a progress message for each line.
def self.analyze(file_reader_in, output_buckwalter)
  # File.foreach streams the file line by line instead of loading it whole,
  # and (unlike IO.readlines) never treats a path starting with "|" as a
  # command to execute.
  File.foreach(file_reader_in) do |line|
    @lines_counter += 1
    puts "Processing line : #{@lines_counter}" if @logger.verbose
    tokenize(line).each { |token| analyze_token(token, output_buckwalter) }
  end
end
|
42
|
+
|
43
|
+
# * Tokenize a line, splitting it on every run of non-Arabic characters
# * [str] text to be tokenized ( never modified; a UTF-8 copy is used )
#
# Returns an Array of Strings. When the split yields nothing ( the text holds
# no Arabic character at all ) the stripped, whitespace-collapsed text is
# returned as the single token, so there is always at least one element.
def self.tokenize(str)
  # dup first: force_encoding works in place and raises on frozen strings.
  text = str.dup.force_encoding("UTF-8").strip.gsub(/\s+/, " ")
  # Kept characters: U+067E, U+0686, U+0698, U+06AF and the contiguous range
  # U+0621..U+0652. Everything else acts as a delimiter.
  #ignored \u0688 : ARABIC LETTER DDAL
  #ignored \u06A9 : ARABIC LETTER KEHEH
  #ignored \u0691 : ARABIC LETTER RREH
  #ignored \u06BA : ARABIC LETTER NOON GHUNNA
  #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
  #ignored \u06C1 : ARABIC LETTER HEH GOAL
  #ignored \u06D2 : ARABIC LETTER YEH BARREE
  pieces = text.split(/[^\u067E\u0686\u0698\u06AF\u0621-\u0652]+/)
  pieces.empty? ? [text] : pieces
end
|
61
|
+
|
62
|
+
|
63
|
+
# * Analyze one token ( i.e. run the morphological analysis on it )
# * [token] word to be analyzed ( never modified; a UTF-8 copy is used )
# * [output_buckwalter] true logs each solution with +to_s+, false with
#   +to_arabized_string+
#
# Returns true when the token, or one of its alternative spellings, has at
# least one solution; false otherwise ( including for non-Arabic tokens ).
# Raises RuntimeError when the found / not-found bookkeeping is inconsistent.
def self.analyze_token(token, output_buckwalter)
  # dup first: force_encoding works in place and raises on frozen strings.
  token = token.dup.force_encoding("UTF-8")
  @logger.info "Processing token : \t#{token}"
  #TODO : check accuracy
  #ignored \u0688 : ARABIC LETTER DDAL
  #ignored \u06A9 : ARABIC LETTER KEHEH
  #ignored \u0691 : ARABIC LETTER RREH
  #ignored \u06BA : ARABIC LETTER NOON GHUNNA
  #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
  #ignored \u06C1 : ARABIC LETTER HEH GOAL
  #ignored \u0640 : ARABIC TATWEEL
  #ignored \u06D2 : ARABIC LETTER YEH BARREE
  unless token =~ /[\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652]/
    # No Arabic character at all: count every whitespace-separated piece.
    token.strip.split(@@space_regex).each do |sub_token|
      next if sub_token.strip == ""
      @not_arabic_tokens_counter += 1
      @logger.info("Non-Arabic : #{sub_token}")
    end
    return false
  end

  # Arabic tokens get their own counter so that @not_arabic_tokens_counter
  # only ever counts non-Arabic pieces. Nil-safe in case the counter has not
  # been initialised yet.
  @arabic_tokens_counter = (@arabic_tokens_counter || 0) + 1

  translitered = ArabicLatinTranslator.translate(token)
  @logger.info("Transliteration : \t#{translitered}")

  has_solutions = resolve_token(translitered)
  #output solutions : TODO consider XML output
  report_token_solutions(translitered, output_buckwalter) unless @logger.output.nil?
  has_solutions
end

# Internal helper for analyze_token: records +translitered+ in @found or
# @not_found ( creating the entry or bumping its reference counter ) and
# returns whether the word has solutions.
def self.resolve_token(translitered)
  if @found.has_key?(translitered) #Already processed : previously found
    @logger.info("Token already processed." , true )
    @found[translitered] += 1
    return true
  end
  if @not_found.has_key?(translitered) #Already processed : previously not found
    @logger.info("Token already processed without solution." , true )
    @not_found[translitered] += 1
    return false
  end

  @logger.info("Token not yet processed.", true )

  if feed_word_solutions(translitered)
    #consistency check
    raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
    @logger.info("Token has direct solutions." , true )
    @found[translitered] = 1
    return true
  end

  unless feed_alternative_spellings(translitered)
    #consistency check
    raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
    @logger.info("Token has no solution and no alternative spellings." , true )
    @not_found[translitered] = 1
    return false
  end

  # NOTE(review): because of the short-circuit, alternatives that come after
  # the first successful one are never fed to feed_word_solutions - confirm
  # whether that is intended.
  alternatives_give_solutions = false
  @@sol.get_alternative_spellings(translitered).each do |alternative|
    alternatives_give_solutions = (alternatives_give_solutions || feed_word_solutions(alternative))
  end

  if alternatives_give_solutions
    #consistency check
    raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
    @logger.info("Token's alternative spellings have solutions." , true )
    @found[translitered] = 1
    true
  else
    #consistency check
    raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
    @logger.info("Token's alternative spellings have no solution." , true )
    @not_found[translitered] = 1
    false
  end
end

# Internal helper for analyze_token: logs the solutions stored for
# +translitered+ and for each of its alternative spellings.
def self.report_token_solutions(translitered, output_buckwalter)
  log_solutions = lambda do |spelling|
    next unless @@sol.has_solutions(spelling)
    @@sol.get_solutions(spelling).each do |solution|
      @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
    end
  end

  if @found.has_key?(translitered)
    log_solutions.call(translitered)
    if @@sol.has_alternative_spellings(translitered)
      @logger.info("No direct solution" , true )
      @@sol.get_alternative_spellings(translitered).each do |alternative|
        @logger.info("Considering alternative spelling :" + "\t#{alternative}" , true )
        log_solutions.call(alternative)
      end
    end
  elsif @not_found.has_key?(translitered)
    @logger.info "\nNo solution\n"
  else
    raise "#{translitered} is neither in found or notFound !"
  end
end
|
176
|
+
|
177
|
+
# * Compute and store the solutions of a transliterated word
# * [translitered] word to be processed
#
# Returns true when solutions are already stored for the word or when at
# least one was produced by this call, false otherwise.
def self.feed_word_solutions(translitered)
  # Solutions stored by an earlier call: no need to reprocess.
  return true if @@sol.has_solutions(translitered)

  solutions = Set.new
  running_count = 0
  # Brute force: hand every candidate segmentation to the dictionary handler,
  # which fills +solutions+ and returns the updated counter.
  segment_word(translitered).each do |segmentation|
    running_count = @@dict.analyze_word_in_dictionaries(segmentation, solutions, @logger.verbose, running_count)
  end

  return false if solutions.empty?

  @@sol.add_solutions(translitered, solutions)
  !solutions.empty?
end
|
195
|
+
|
196
|
+
# * Return the solutions of the given word
# * [word] word to be processed ( never modified; a UTF-8 copy is used )
#
# Returns a Set holding the solutions stored for the word and for each of its
# alternative spellings; the Set is empty for a word recorded as not found.
# Raises RuntimeError when the word is in neither @found nor @not_found.
def self.get_word_solutions(word)
  # dup first: force_encoding works in place and raises on frozen strings.
  translitered = ArabicLatinTranslator.translate(word.dup.force_encoding("UTF-8"))
  word_solutions = Set.new

  unless @found.has_key?(translitered)
    return word_solutions if @not_found.has_key?(translitered)
    raise "#{translitered} is neither in found or notFound !"
  end

  collect = lambda do |spelling|
    @@sol.get_solutions(spelling).each { |solution| word_solutions << solution } if @@sol.has_solutions(spelling)
  end

  collect.call(translitered)
  if @@sol.has_alternative_spellings(translitered)
    @@sol.get_alternative_spellings(translitered).each { |alt| collect.call(alt) }
  end
  word_solutions
end
|
214
|
+
|
215
|
+
# * Segment the given word into every candidate prefix / stem / suffix split
# * [translitered] word to be processed
#
# Returns a Set of SegmentedWord. The prefix is 0 to 4 characters long, the
# suffix 0 to 6 characters long and the stem always has at least 1 character.
def self.segment_word(translitered)
  word_len = translitered.length
  segmented = Set.new

  (0..[4, word_len].min).each do |prefix_len|
    prefix = translitered.slice(0, prefix_len)
    rest_len = word_len - prefix_len

    (0..6).each do |suffix_len|
      stem_len = rest_len - suffix_len
      # Longer suffixes would only shrink the stem further.
      break if stem_len < 1

      stem = translitered.slice(prefix_len, stem_len)
      suffix = translitered.slice(prefix_len + stem_len, suffix_len)
      segmented.add(SegmentedWord.new(prefix, stem, suffix))
    end
  end

  segmented
end
|
241
|
+
|
242
|
+
# Prints the run statistics to standard output: lines read, Arabic and
# non-Arabic token counts, and how many distinct words were / were not
# analysed successfully ( with percentages rounded to two decimals ).
def self.print_stats
  found_words = @found.length
  missing_words = @not_found.length
  total = (found_words + missing_words).to_f

  # Every Arabic token either creates an entry in @found / @not_found or bumps
  # its reference counter, so the summed counters are the Arabic token count.
  arabic_tokens = (@found.values + @not_found.values).inject(0) { |sum, occurrences| sum + occurrences }

  # With no analysed word the ratio would be NaN and #round would raise
  # FloatDomainError, so report 0.0 in that case.
  percentage = lambda do |count|
    total.zero? ? 0.0 : (100 * (count * 100 / total)).round / 100.0
  end

  puts "=================== Statistics ==================="
  puts "Lines : #{@lines_counter}"
  puts "Arabic tokens : #{arabic_tokens}"
  puts "Non-arabic tokens : #{@not_arabic_tokens_counter}"
  puts "Words found : #{found_words} (#{percentage.call(found_words)}%)"
  puts "Words not found : #{missing_words} (#{percentage.call(missing_words)}%)"
  puts "=================================================="
end
|
253
|
+
|
254
|
+
# * Find and store the alternative spellings of a transliterated word
# * [translitered] word to be processed ( never modified )
#
# Returns true when alternative spellings are already stored for the word or
# when at least one was produced by this call, false otherwise.
def self.feed_alternative_spellings(translitered)
  # Alternatives stored by an earlier call: no need to reprocess.
  return true if @@sol.has_alternative_spellings(translitered)

  word_alternative_spellings = compute_alternative_spellings(translitered)
  unless word_alternative_spellings.empty?
    @@sol.add_alternative_spellings(translitered, word_alternative_spellings)
  end
  !word_alternative_spellings.empty?
end

# Internal helper for feed_alternative_spellings: builds the Set of
# alternative spellings of +translitered+, logging each one as it is found.
#
# Every substitution produces a new string. The previous in-place version
# rewrote +translitered+ itself, so the caller's lookup key changed under it.
def self.compute_alternative_spellings(translitered)
  spellings = Set.new

  # Logs and stores one candidate, then hands it back for the next step.
  keep = lambda do |candidate|
    @logger.info "Found alternative spelling #{candidate} for word #{translitered}" , true
    spellings.add(candidate)
    candidate
  end
  # Stores +after+ only when the substitution changed something; returns the
  # string the next substitution should start from.
  keep_if_changed = lambda do |before, after|
    before == after ? before : keep.call(after)
  end

  case translitered
  when /Y'$/
    temp = keep.call(translitered.gsub(/Y/, "y"))
    # NOTE(review): this branch replaces a bare "w" whereas every other branch
    # replaces "w'" - looks like a typo, confirm against the reference
    # implementation before changing.
    keep_if_changed.call(temp, temp.sub(/w/, "&"))
    temp = keep.call(translitered.gsub(/Y/, "y").sub(/y'$/, "}"))
    keep_if_changed.call(temp, temp.sub(/w/, "&"))
  when /y'$/
    temp = keep_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
    keep_if_changed.call(temp, temp.sub(/w'/, "&"))
    temp = keep.call(translitered.gsub(/Y/, "y").sub(/y'$/, "}"))
    keep_if_changed.call(temp, temp.sub(/w'/, "&"))
  when /y$/
    temp = translitered.gsub(/Y/, "y")
    keep_if_changed.call(temp, temp.sub(/w'/, "&"))
    temp = keep.call(translitered.gsub(/Y/, "y").gsub(/y$/, "Y"))
    keep_if_changed.call(temp, temp.sub(/w'/, "&"))
  when /h$/
    temp = keep_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
    temp = keep_if_changed.call(temp, temp.sub(/w'/, "&"))
    # NOTE(review): a word ending in "h" cannot match /p$/, so this
    # substitution is a no-op and the candidate stored here may equal the
    # word itself. The intended rule may be the reverse ( h -> p ), and the
    # "p$" ending is never tested - confirm before changing.
    keep.call(temp.sub(/p$/, "h"))
  else
    # No special ending: normalise every "Y" to "y", then try "w'" -> "&".
    temp = keep_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
    keep_if_changed.call(temp, temp.sub(/w'/, "&"))
  end

  spellings
end
|
385
|
+
|
386
|
+
# Runs the morphological analyzer on a file: resets the run state, analyses
# the input, flushes the logger and prints the statistics.
# * [input_filename] input file path
# * [output_filename] output file path, handed to the logger
# * [verbose] currently unused ( see the note in the body )
# * [not_arabic] passed to analyze as its +output_buckwalter+ argument, i.e.
#   whether solutions are written in Buckwalter ( Roman ) letters or in Arabic
def self.execute(input_filename, output_filename ,verbose = false, not_arabic = false)
  # NOTE(review): +verbose+ is never read; the logger is always built with a
  # literal +true+ as its first argument. Presumably +verbose+ was meant to be
  # passed here - confirm against the Logger constructor before changing.
  @logger = Logger.new(true , output_filename )
  @not_arabic = not_arabic
  # Lines processed
  @lines_counter = 0
  # Arabic tokens processed
  @arabic_tokens_counter = 0
  # Not arabic tokens processed
  @not_arabic_tokens_counter = 0
  # Arabic words which have been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @found = {}
  # Arabic words which have not been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @not_found = {}
  analyze(input_filename , @not_arabic)
  @logger.log
  print_stats
end
|
414
|
+
end
|
415
|
+
|
416
|
+
# One candidate segmentation of a word: its prefix, stem and suffix.
# Assigned directly instead of subclassing an anonymous Struct, which would
# insert a needless extra class into the ancestor chain.
SegmentedWord = Struct.new(:prefix, :stem, :suffix)
|
417
|
+
|