mosta-raramorph 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,417 @@
1
+ # A Ruby port of Buckwalter Morphological Analyzer Version 1.0.
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
+
7
+ class Raramorph
8
+
9
# The dictionary handler.
@@dict = InMemoryDictionaryHandler.create

# The solutions handler.
@@sol = InMemorySolutionsHandler.create

# Alternative spellings: list of regular expressions, one per word ending
# (Y', y', y, h, p in that order), each matching any text followed by that
# ending at end of line.
@@alternative_spellings = ["Y'$", "y'$", "y$", "h$", "p$"].map do |ending|
  Regexp.compile(".*" + ending)
end

# One or more consecutive whitespace characters.
@@space_regex = /\s+/
21
+
22
# * Analyze and process the file ( i.e. do the morphological analysis ) :
#   every line is tokenized and every token is handed to analyze_token.
# * [file_reader_in] Input file path
# * [output_buckwalter] passed unchanged to analyze_token for each token
#
# Increments @lines_counter once per line and, when the logger is verbose,
# prints a progress message for the line. Returns the array of lines read.
def self.analyze(file_reader_in, output_buckwalter)
  # File.readlines always opens a file; IO.readlines would treat a path that
  # starts with "|" as a command to execute.
  File.readlines(file_reader_in).each do |line|
    @lines_counter += 1
    puts "Processing line : #{@lines_counter.to_s}" if @logger.verbose
    tokenize(line).each { |token| analyze_token(token, output_buckwalter) }
  end
end
42
+
43
# * Tokenize a line : surrounding whitespace is stripped, every run of
#   whitespace is collapsed to a single space, and the result is split on
#   every run of characters that are not in the accepted set below.
# * [str] text to be tokenized ( left untouched, a copy is re-encoded )
#
# Returns an Array of Strings. When the split produces no piece at all the
# normalised text itself is returned as the only token. A text that starts
# with non-accepted characters yields a leading empty token.
def self.tokenize(str)
  # force_encoding mutates its receiver, so work on a copy : the caller's
  # string keeps its encoding and frozen strings are accepted.
  normalised = str.dup.force_encoding("UTF-8").strip.gsub(/\s+/, " ")
  #ignored \u0688 : ARABIC LETTER DDAL
  #ignored \u06A9 : ARABIC LETTER KEHEH
  #ignored \u0691 : ARABIC LETTER RREH
  #ignored \u06BA : ARABIC LETTER NOON GHUNNA
  #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
  #ignored \u06C1 : ARABIC LETTER HEH GOAL
  #ignored \u06D2 : ARABIC LETTER YEH BARREE
  pieces = normalised.split(/[^\u067E\u0686\u0698\u06AF\u0621-\u0636\u0637-\u0643\u0644\u0645-\u0648\u0649-\u064A\u064B-\u064E\u064F\u0650\u0651\u0652]+/)
  #return at least one token, the string if necessary
  pieces.empty? ? [normalised] : pieces
end
61
+
62
+
63
# * Analyze one token ( i.e. do the morphological analysis ) and record the
#   outcome in @found / @not_found, whose values are occurrence counts
#   keyed by the transliterated word.
# * [token] word to be analyzed ( left untouched, a copy is re-encoded )
# * [output_buckwalter] when true solutions are logged with to_s, otherwise
#   with to_arabized_string
#
# Returns true when the token, or one of its alternative spellings, has at
# least one solution. Returns false otherwise, including for tokens that
# contain none of the accepted Arabic characters.
# Raises RuntimeError when the bookkeeping hashes are inconsistent.
def self.analyze_token(token, output_buckwalter)
  #TO DO SET UP THE PRINT STREAM
  # Work on a copy : force_encoding mutates its receiver and would raise on a
  # frozen string.
  token = token.dup.force_encoding("UTF-8")
  @logger.info "Processing token : " + "\t" + token
  #TODO : check accuracy
  #ignored \u0688 : ARABIC LETTER DDAL
  #ignored \u06A9 : ARABIC LETTER KEHEH
  #ignored \u0691 : ARABIC LETTER RREH
  #ignored \u06BA : ARABIC LETTER NOON GHUNNA
  #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
  #ignored \u06C1 : ARABIC LETTER HEH GOAL
  #ignored \u0640 : ARABIC TATWEEL
  #ignored \u06D2 : ARABIC LETTER YEH BARREE
  unless token =~ /([\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652])+/
    # No accepted character at all : count each whitespace-separated piece
    # as a non-Arabic token.
    token.strip.split(/\s+/).each do |sub_token|
      next if sub_token.strip == ""
      @not_arabic_tokens_counter += 1
      @logger.info("Non-Arabic : #{sub_token}")
    end
    return false
  end

  # Arabic tokens get their own counter so that the non-Arabic count is not
  # inflated. It is created on first use.
  @arabic_tokens_counter = (@arabic_tokens_counter || 0) + 1

  translitered = ArabicLatinTranslator.translate(token)
  @logger.info("Transliteration : \t#{translitered}")

  if @found.has_key?(translitered) #Already processed : previously found
    @logger.info("Token already processed." , true )
    @found[translitered] += 1 #increase reference counter
    has_solutions = true
  elsif @not_found.has_key?(translitered) #Already processed : previously not found
    @logger.info("Token already processed without solution." , true )
    @not_found[translitered] += 1 #increase reference counter
    has_solutions = false
  else
    @logger.info("Token not yet processed.", true )

    if feed_word_solutions(translitered) #word has solutions...
      has_solutions = true
      outcome = "Token has direct solutions."
    elsif feed_alternative_spellings(translitered) #word has no direct solution
      has_solutions = false
      # NOTE(review): because of the short-circuit, alternatives that come
      # after the first one with solutions are never fed. Kept as is.
      @@sol.get_alternative_spellings(translitered).each do |alternative|
        has_solutions = (has_solutions || feed_word_solutions(alternative))
      end
      outcome = if has_solutions
                  "Token's alternative spellings have solutions."
                else
                  "Token's alternative spellings have no solution."
                end
    else #there are no alternative
      has_solutions = false
      outcome = "Token has no solution and no alternative spellings."
    end

    # First sighting of this word : consistency check, then set its
    # reference counter to 1 in the matching hash.
    if has_solutions
      raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
      @logger.info(outcome, true)
      @found[translitered] = 1
    else
      raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
      @logger.info(outcome, true)
      @not_found[translitered] = 1
    end
  end

  #output solutions : TODO consider XML output
  unless @logger.output.nil?
    if @found.has_key?(translitered)
      if @@sol.has_solutions(translitered)
        @@sol.get_solutions(translitered).each do |solution|
          @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
        end
      end
      if @@sol.has_alternative_spellings(translitered)
        @logger.info("No direct solution" , true )
        @@sol.get_alternative_spellings(translitered).each do |alternative|
          @logger.info("Considering alternative spelling :" + "\t#{alternative}" , true )
          next unless @@sol.has_solutions(alternative)
          @@sol.get_solutions(alternative).each do |solution|
            @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
          end
        end
      end
    elsif @not_found.has_key?(translitered)
      @logger.info "\nNo solution\n"
    else
      raise "#{translitered} is neither in found or notFound !"
    end
  end

  has_solutions
end
176
+
177
# * Look up the solutions of a transliterated word and store them in the
#   solutions handler
# * [translitered] word to be processed
#
# Returns true when solutions are already stored for the word or when the
# dictionary analysis of its segmentations produced at least one solution.
def self.feed_word_solutions(translitered)
  #No need to reprocess
  return true if @@sol.has_solutions(translitered)

  collected = Set.new
  #Brute force algorithm : try every valid segmentation, threading the
  #running count returned by the dictionary handler into the next call
  segment_word(translitered).inject(0) do |running_count, candidate|
    @@dict.analyze_word_in_dictionaries(candidate, collected, @logger.verbose, running_count)
  end

  return false if collected.empty?

  #Add all solutions
  @@sol.add_solutions(translitered, collected)
  true
end
195
+
196
# * Return the solutions stored for the given word, including those of its
#   alternative spellings
# * [word] word to be processed ( left untouched, a copy is re-encoded )
#
# Returns a Set, empty when the word is recorded in @not_found.
# Raises RuntimeError when the transliterated word is in neither @found nor
# @not_found, i.e. it was never analyzed.
def self.get_word_solutions(word)
  # force_encoding mutates its receiver, so re-encode a copy.
  translitered = ArabicLatinTranslator.translate(word.dup.force_encoding("UTF-8"))
  word_solutions = Set.new

  unless @found.has_key?(translitered)
    return word_solutions if @not_found.has_key?(translitered)
    raise "#{translitered} is neither in found or notFound !"
  end

  if @@sol.has_solutions(translitered)
    @@sol.get_solutions(translitered).each { |solution| word_solutions << solution }
  end
  if @@sol.has_alternative_spellings(translitered)
    @@sol.get_alternative_spellings(translitered).each do |alt|
      next unless @@sol.has_solutions(alt)
      @@sol.get_solutions(alt).each { |solution| word_solutions << solution }
    end
  end
  word_solutions
end
214
+
215
# * Segment the given word into every prefix / stem / suffix combination
#   with a prefix of at most 4 characters, a suffix of at most 6 characters
#   and a stem of at least 1 character
# * [translitered] word to be processed
#
# Returns a Set of SegmentedWord. It is empty for an empty word.
def self.segment_word(translitered)
  segmented = Set.new
  word_len = translitered.length

  (0..[4, word_len].min).each do |prefix_len|
    # The stem keeps at least one character, which caps the suffix length.
    longest_suffix = [6, word_len - prefix_len - 1].min
    (0..longest_suffix).each do |suffix_len|
      stem_len = word_len - prefix_len - suffix_len
      segmented.add(SegmentedWord.new(translitered.slice(0, prefix_len),
                                      translitered.slice(prefix_len, stem_len),
                                      translitered.slice(prefix_len + stem_len, suffix_len)))
    end
  end

  segmented
end
241
+
242
# Prints the run statistics to standard output : lines read, Arabic and
# non-Arabic token counts, and how many distinct words were found / not
# found, with their share of all distinct words rounded to two decimals.
def self.print_stats
  total = (@found.length + @not_found.length).to_f
  # Every analyzed Arabic token adds exactly one to an occurrence count in
  # @found or @not_found, so their sum is the number of Arabic tokens.
  arabic_tokens = (@found.values + @not_found.values).inject(0) { |sum, count| sum + count }
  # With no words at all the ratio would be NaN and rounding it raises, so
  # report 0.0 in that case.
  percentage = lambda do |count|
    total.zero? ? 0.0 : (100 * (count * 100 / total)).round / 100.0
  end

  puts "=================== Statistics ==================="
  puts "Lines : " + @lines_counter.to_s
  puts "Arabic tokens : " + arabic_tokens.to_s
  puts "Non-arabic tokens : " + @not_arabic_tokens_counter.to_s
  puts "Words found : " + @found.length.to_s + " (" + percentage.call(@found.length).to_s + "%)"
  puts "Words not found : " + @not_found.length.to_s + " (" + percentage.call(@not_found.length).to_s + "%)"
  puts "=================================================="
end
253
+
254
# * Find alternative spellings for the transliterated word and store them
#   in the solutions handler
# * [translitered] word to be processed ( never modified )
#
# Returns true when alternative spellings are already stored for the word
# or when at least one was derived, false otherwise. Each derived spelling
# is logged before it is stored.
def self.feed_alternative_spellings(translitered)
  return true if @@sol.has_alternative_spellings(translitered)

  word_alternative_spellings = Set.new
  alternative_spellings_for(translitered).each do |spelling|
    @logger.info "Found alternative spelling #{spelling} for word #{translitered}" , true
    word_alternative_spellings.add(spelling)
  end
  return false if word_alternative_spellings.empty?

  @@sol.add_alternative_spellings(translitered, word_alternative_spellings)
  true
end

# Derives the alternative spellings of a transliterated word, picking one
# rule from the word's ending : Y' or y', then y, then h, then any other.
# The endings are the first four of the @@alternative_spellings patterns.
# * [word] transliterated word ; every step builds a new string, so neither
#   the word nor a spelling already collected is modified afterwards
#
# Returns an Array of Strings in the order the spellings were derived. It
# may be empty and may contain duplicates.
def self.alternative_spellings_for(word)
  derived = []
  # Adds the spelling with the first "w'" replaced by "&", if there is one,
  # and returns that spelling ( or the base when nothing changed ).
  with_ampersand = lambda do |base|
    variant = base.sub(/w'/, "&")
    derived << variant unless variant == base
    variant
  end

  lowered = word.gsub(/Y/, "y")

  case word
  when /Y'$/, /y'$/
    # The Y' branch used /w/ where every other branch uses /w'/ ; it now
    # follows the same rule as the y' branch.
    derived << lowered unless lowered == word
    with_ampersand.call(lowered)
    braced = lowered.sub(/y'$/, "}")
    derived << braced
    with_ampersand.call(braced)
  when /y$/
    with_ampersand.call(lowered)
    upper_final = lowered.gsub(/y$/, "Y")
    derived << upper_final
    with_ampersand.call(upper_final)
  when /h$/
    derived << lowered unless lowered == word
    current = with_ampersand.call(lowered)
    # The word ends in "h" here, so the former /p$/ => "h" substitution
    # could never match and re-added the unchanged word. The final "h" is
    # now swapped for "p".
    # NOTE(review): confirm the h => p direction against the reference
    # Buckwalter rules.
    derived << current.sub(/h$/, "p")
  else
    # One of the former fallback paths added the unchanged word instead of
    # its lowered form ; all of them now add the lowered form.
    derived << lowered unless lowered == word
    with_ampersand.call(lowered)
  end

  derived
end
385
+
386
# Executes the morphological analyzer : initialises the run state, analyzes
# the input file, flushes the logger and prints the statistics.
# * [input_filename] input file path
# * [output_filename] output file path, handed to the logger
# * [verbose] currently unused ( see the note in the body )
# * [not_arabic] passed to analyze as its output_buckwalter flag, i.e.
#   whether the output uses Buckwalter ( Roman ) letters or Arabic letters
#
# Returns whatever print_stats returns.
def self.execute(input_filename, output_filename, verbose = false, not_arabic = false)
  # NOTE(review): `verbose` is never read ; the logger always receives
  # `true` as its first argument. Confirm whether it should receive
  # `verbose` instead. Left unchanged because it would alter default output.
  @logger = Logger.new(true , output_filename )
  @not_arabic = not_arabic
  # Lines processed
  @lines_counter = 0
  # Arabic tokens processed ( kept apart from the non-Arabic count below )
  @arabic_tokens_counter = 0
  # Not arabic tokens processed
  @not_arabic_tokens_counter = 0
  # Arabic words which have been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @found = {}
  # Arabic words which have not been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @not_found = {}
  analyze(input_filename , @not_arabic)
  @logger.log
  print_stats
end
414
+ end
415
+
416
# One way of cutting a word into a prefix, a stem and a suffix.
# Assigned directly rather than subclassing an anonymous Struct, which would
# leave a nameless extra class in the ancestor chain.
SegmentedWord = Struct.new(:prefix, :stem, :suffix)
417
+