raramorph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,537 @@
1
+ # A Ruby port of Buckwalter Arabic Morphological Analyzer Version 1.0.
2
+ #
3
+ # Author:: eSpace technologies www.eSpace.com.eg
4
+ # Copyright:: 2008
5
+
6
+
7
require 'set'
require 'stringio'
8
+
9
class Raramorph
  # Morphological analyzer entry point. All behaviour is exposed through class
  # methods; analysis state (counters, found / not-found words, output buffer)
  # is kept in class-level instance variables and reset by Raramorph.execute.

  # The dictionary handler.
  @@dict = InMemoryDictionaryHandler.create
  # The solutions handler.
  @@sol = InMemorySolutionsHandler.create

  # Whether or not the analyzer should output some convenience messages
  @verbose = nil
  # Flag telling whether results are collected for an output file
  # (set to true by Raramorph.execute, nil otherwise)
  @output_stream = nil
  # Buffer (StringIO) receiving the results; nil until Raramorph.execute runs
  @stream = nil
  # Use arabic translation or not?
  @not_arabic = nil

  # Stats
  # Lines processed
  @lines_counter = 0
  # Arabic tokens processed
  @arabic_tokens_counter = 0
  # Not arabic tokens processed
  @not_arabic_tokens_counter = 0

  # Arabic words which have been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @found = {}

  # Arabic words which have not been successfully analyzed.
  # * [key] = word
  # * [value] = occurrences
  @not_found = {}

  # Alternative spellings list of regular expressions.
  # NOTE(review): entry 4 ("p$") is never consulted by
  # feed_alternative_spellings - confirm whether a branch is missing.
  @@alternative_spellings = [
    Regexp.compile(".*" + "Y'$"),
    Regexp.compile(".*" + "y'$"),
    Regexp.compile(".*" + "y$"),
    Regexp.compile(".*" + "h$"),
    Regexp.compile(".*" + "p$")
  ]
  @@space_regex = Regexp.compile("\\s+")

  # * Enable or disable the convenience messages
  # * [verbose] Boolean flag
  def self.set_verbose(verbose)
    @verbose = verbose
  end

  # * Analyze and Process the file ( i.e Doing the morphological Analysis )
  # * [file_reader_in] Input File Path
  # * [output_buckwalter] whether the output in buckwalter indications ( i.e Roman letters ) or arabic letters
  def self.analyze(file_reader_in, output_buckwalter)
    # File.readlines (unlike IO.readlines) never treats a "|cmd" path as a
    # command to spawn.
    File.readlines(file_reader_in).each do |line|
      @lines_counter += 1
      puts "Processing line : " + @lines_counter.to_s if @verbose
      tokenize(line).each { |token| analyze_token(token, output_buckwalter) }
    end
  end

  # * Tokenize the line, splitting it on runs of characters outside the
  #   accepted Arabic set
  # * [str] text to be tokenized
  # * Returns an Array of Strings; never empty (falls back to the normalized
  #   text itself when the split yields nothing)
  def self.tokenize(str)
    # Work on a copy so the caller's string is neither re-tagged nor required
    # to be unfrozen.
    text = str.dup.force_encoding("UTF-8").strip.gsub(@@space_regex, " ")
    #ignored \u0688 : ARABIC LETTER DDAL
    #ignored \u06A9 : ARABIC LETTER KEHEH
    #ignored \u0691 : ARABIC LETTER RREH
    #ignored \u06BA : ARABIC LETTER NOON GHUNNA
    #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
    #ignored \u06C1 : ARABIC LETTER HEH GOAL
    #ignored \u06D2 : ARABIC LETTER YEH BARREE
    parts = text.split(/[^\u067E\u0686\u0698\u06AF\u0621-\u0636\u0637-\u0643\u0644\u0645-\u0648\u0649-\u064A\u064B-\u064E\u064F\u0650\u0651\u0652]+/)
    # return at least one token, the string if necessary
    parts.empty? ? [text] : parts
  end

  # * Analyze Token doing the morphological Analysis
  # * [token] word to be analyzed
  # * [output_buckwalter] whether the output in buckwalter indications ( i.e Roman letters ) or arabic letters
  # * Returns true when the token (or one of its alternative spellings) has
  #   at least one solution, false otherwise
  def self.analyze_token(token, output_buckwalter)
    token = token.dup.force_encoding("UTF-8")
    emit("Processing token : " + "\t" + token)
    #TODO : check accuracy
    #ignored \u0688 : ARABIC LETTER DDAL
    #ignored \u06A9 : ARABIC LETTER KEHEH
    #ignored \u0691 : ARABIC LETTER RREH
    #ignored \u06BA : ARABIC LETTER NOON GHUNNA
    #ignored \u06BE : ARABIC LETTER HEH DOACHASHMEE
    #ignored \u06C1 : ARABIC LETTER HEH GOAL
    #ignored \u0640 : ARABIC TATWEEL
    #ignored \u06D2 : ARABIC LETTER YEH BARREE
    unless token =~ /([\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652])+/
      # No Arabic character at all: count every whitespace-separated piece.
      token.strip.split(@@space_regex).each do |sub_token|
        next if sub_token.strip == ""
        @not_arabic_tokens_counter += 1
        emit("Non-Arabic : " + sub_token)
      end
      return false
    end

    @arabic_tokens_counter += 1
    translitered = ArabicLatinTranslator.translate(token)
    emit("Transliteration : " + "\t" + translitered)

    has_solutions =
      if @found.has_key?(translitered) # Already processed : previously found
        emit_verbose("Token already processed.")
        @found[translitered] += 1 # increase reference counter
        true
      elsif @not_found.has_key?(translitered) # Already processed : previously not found
        emit_verbose("Token already processed without solution.")
        @not_found[translitered] += 1 # increase reference counter
        false
      else
        emit_verbose("Token not yet processed.")
        resolve_new_word(translitered)
      end

    # output solutions : TODO consider XML output
    unless @output_stream.nil?
      if @found.has_key?(translitered)
        write_solutions(translitered, output_buckwalter)
        if @@sol.has_alternative_spellings(translitered)
          emit_verbose("No direct solution")
          @@sol.get_alternative_spellings(translitered).each do |alternative|
            emit_verbose("Considering alternative spelling :" + "\t" + alternative)
            write_solutions(alternative, output_buckwalter)
          end
        end
      elsif @not_found.has_key?(translitered)
        emit("\nNo solution\n")
      else
        raise "#{translitered} is neither in found or notFound !"
      end
    end

    has_solutions
  end

  # * Find the Solution for the translitered word
  # * [translitered] word to be processed
  # * Returns true when the solutions handler holds at least one solution
  #   for the word after this call
  def self.feed_word_solutions(translitered)
    return true if @@sol.has_solutions(translitered) # No need to reprocess

    word_solutions = Set.new
    count = 0
    # Brute force: try every segmentation whose three parts are all known to
    # the dictionary, then keep the entry triples that are pairwise compatible.
    segment_word(translitered).each do |segment|
      next unless @@dict.has_prefix?(segment.prefix) &&
                  @@dict.has_stem?(segment.stem) &&
                  @@dict.has_suffix?(segment.suffix)

      @@dict.prefixes[segment.prefix].each do |prefix|
        @@dict.stems[segment.stem].each do |stem|
          # Prefix/Stem compatibility
          next unless @@dict.prefixes_stems_compatible?(prefix.morphology, stem.morphology)

          @@dict.suffixes[segment.suffix].each do |suffix|
            # Prefix/Suffix compatibility
            next unless @@dict.prefixes_suffixes_compatible?(prefix.morphology, suffix.morphology)
            # Stems/Suffixes compatibility
            next unless @@dict.stems_suffixes_compatible?(stem.morphology, suffix.morphology)

            # All tests passed : it is a solution
            count += 1
            word_solutions << Solution.new(@verbose, count, prefix, stem, suffix)
          end
        end
      end
    end

    # Add all solutions, if any
    @@sol.add_solutions(translitered, word_solutions) unless word_solutions.empty?
    !word_solutions.empty?
  end

  # * Return the Solutions of the given Word
  # * [word] word to be processed (it must already have been analyzed)
  # * Returns a Set with the solutions of the word and of its alternative
  #   spellings; empty when the word is recorded as not found
  # * Raises RuntimeError when the word has never been analyzed
  def self.get_word_solutions(word)
    translitered = ArabicLatinTranslator.translate(word.dup.force_encoding("UTF-8"))
    word_solutions = Set.new
    collect = lambda do |key|
      @@sol.get_solutions(key).each { |solution| word_solutions << solution } if @@sol.has_solutions(key)
    end

    if @found.has_key?(translitered)
      collect.call(translitered)
      if @@sol.has_alternative_spellings(translitered)
        @@sol.get_alternative_spellings(translitered).each { |alternative| collect.call(alternative) }
      end
    elsif !@not_found.has_key?(translitered)
      raise "#{translitered} is neither in found or notFound !"
    end
    word_solutions
  end

  # * Segment the given word constructing prefix , stem , suffix
  # * [translitered] word to be processed
  # * Returns a Set of SegmentedWord covering every split with a prefix of
  #   0..4 characters, a suffix of 0..6 characters and a stem of at least one
  #   character
  def self.segment_word(translitered)
    segmented = Set.new
    length = translitered.length

    (0..[4, length].min).each do |prefix_len|
      remaining = length - prefix_len
      # The stem keeps at least one character, so the suffix can take at most
      # remaining - 1 (the range is empty when nothing remains).
      (0..[6, remaining - 1].min).each do |suffix_len|
        stem_len = remaining - suffix_len
        segmented.add(SegmentedWord.new(translitered.slice(0, prefix_len),
                                        translitered.slice(prefix_len, stem_len),
                                        translitered.slice(prefix_len + stem_len, suffix_len)))
      end
    end

    segmented
  end

  # * Print the statistics of the last analysis on standard output
  def self.print_stats
    total = (@found.length + @not_found.length).to_f
    puts "=================== Statistics ==================="
    puts "Lines : " + @lines_counter.to_s
    puts "Arabic tokens : " + @arabic_tokens_counter.to_s
    puts "Non-arabic tokens : " + @not_arabic_tokens_counter.to_s
    puts "Words found : " + @found.length.to_s + " (" + percentage(@found.length, total).to_s + "%)"
    puts "Words not found : " + @not_found.length.to_s + " (" + percentage(@not_found.length, total).to_s + "%)"
    puts "=================================================="
  end

  # * Find Alternative Spellings for the translitered word
  # * [translitered] word to be processed; it is never modified, because the
  #   callers use it as a lookup key
  # * Returns true when at least one alternative spelling is registered for
  #   the word in the solutions handler
  def self.feed_alternative_spellings(translitered)
    return true if @@sol.has_alternative_spellings(translitered) # No need to reprocess

    word_alternative_spellings = Set.new
    # Registers a candidate spelling and hands it back for chaining.
    record = lambda do |candidate|
      emit_verbose("Found alternative spelling " + candidate + " for word " + translitered)
      word_alternative_spellings.add(candidate)
      candidate
    end
    # Registers +candidate+ only when it differs from +current+; returns the
    # spelling the next rewrite has to start from.
    record_if_changed = lambda do |current, candidate|
      current == candidate ? current : record.call(candidate)
    end

    if translitered =~ @@alternative_spellings[0]
      temp = record.call(translitered.gsub(/Y/, "y"))
      # NOTE(review): this branch rewrites a bare "w" while every other branch
      # rewrites "w'" - kept as found, confirm which one is intended.
      record_if_changed.call(temp, temp.sub(/w/, "&"))
      temp = record.call(translitered.gsub(/Y/, "y").sub(/y'$/, "}"))
      record_if_changed.call(temp, temp.sub(/w/, "&"))

    elsif translitered =~ @@alternative_spellings[1]
      temp = record_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
      record_if_changed.call(temp, temp.sub(/w'/, "&"))
      temp = record.call(translitered.gsub(/Y/, "y").sub(/y'$/, "}"))
      record_if_changed.call(temp, temp.sub(/w'/, "&"))

    elsif translitered =~ @@alternative_spellings[2]
      temp = translitered.gsub(/Y/, "y")
      record_if_changed.call(temp, temp.sub(/w'/, "&"))
      temp = record.call(translitered.gsub(/Y/, "y").gsub(/y$/, "Y"))
      record_if_changed.call(temp, temp.sub(/w'/, "&"))

    elsif translitered =~ @@alternative_spellings[3]
      temp = record_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
      temp = record_if_changed.call(temp, temp.sub(/w'/, "&"))
      # NOTE(review): the word ends in "h" here, so replacing a final "p" can
      # not match and the current spelling is registered as is - kept as
      # found, confirm whether a final "h" -> "p" rewrite was intended.
      record.call(temp.sub(/p$/, "h"))

    else
      final_rewritten = translitered.sub(/Y$/, "y")
      temp =
        if final_rewritten != translitered
          record.call(final_rewritten.gsub(/Y/, "y"))
        else
          record_if_changed.call(translitered, translitered.gsub(/Y/, "y"))
        end
      record_if_changed.call(temp, temp.sub(/w'/, "&"))
    end

    unless word_alternative_spellings.empty?
      @@sol.add_alternative_spellings(translitered, word_alternative_spellings)
    end
    !word_alternative_spellings.empty?
  end

  # Executes the morphological Analyzer and initiates the variables
  # * [input_filename] input file path
  # * [output_filename] Output file path
  # * [verbose] Setter for verbose
  # * [not_arabic] alias for output_buckwalter, indicating whether the output is in buckwalter indications or in arabic
  def self.execute(input_filename, output_filename, verbose = false, not_arabic = true)
    @output_stream = true
    @not_arabic = not_arabic
    @verbose = verbose
    # Every run starts from clean statistics.
    @lines_counter = 0
    @arabic_tokens_counter = 0
    @not_arabic_tokens_counter = 0
    @found = {}
    @not_found = {}
    @stream = StringIO.new

    analyze(input_filename, @not_arabic)
    File.open(output_filename, "w") { |file| file.puts @stream.string }
    print_stats
  end

  # Writes a result line to the output buffer, or to standard output when no
  # buffer has been set up (analysis driven without Raramorph.execute).
  def self.emit(message)
    (@stream || $stdout).puts(message)
  end

  # Writes a convenience message, only when verbose mode is on.
  def self.emit_verbose(message)
    emit(message) if @verbose
  end

  # First analysis of a word: looks for direct solutions, then for solutions
  # of its alternative spellings, and records the word in @found or
  # @not_found with a reference counter of 1. Returns true when solved.
  def self.resolve_new_word(translitered)
    if feed_word_solutions(translitered)
      emit_verbose("Token has direct solutions.")
      @found[translitered] = 1
      return true
    end

    unless feed_alternative_spellings(translitered)
      emit_verbose("Token has no solution and no alternative spellings.")
      @not_found[translitered] = 1
      return false
    end

    alternatives_give_solutions = false
    @@sol.get_alternative_spellings(translitered).each do |alternative|
      # As before, alternatives after the first solved one are not analyzed.
      alternatives_give_solutions ||= feed_word_solutions(alternative)
    end

    if alternatives_give_solutions
      emit_verbose("Token's alternative spellings have solutions.")
      @found[translitered] = 1
      true
    else
      emit_verbose("Token's alternative spellings have no solution.")
      @not_found[translitered] = 1
      false
    end
  end

  # Writes every stored solution of +word+ (if any) in the requested notation.
  def self.write_solutions(word, output_buckwalter)
    return unless @@sol.has_solutions(word)

    @@sol.get_solutions(word).each do |solution|
      emit("#{output_buckwalter ? solution.to_s : solution.to_arabized_string}")
    end
  end

  # Share of +count+ in +total+ as a percentage rounded to two decimals;
  # 0.0 when nothing was analyzed (rounding NaN would raise).
  def self.percentage(count, total)
    return 0.0 if total.zero?

    (100 * (count * 100 / total)).round / 100.0
  end

  private_class_method :emit, :emit_verbose, :resolve_new_word, :write_solutions, :percentage
end
526
+
527
class SegmentedWord
  # Read-only holder for one segmentation of a word into its three parts.
  # Author:: eSpace technologies www.eSpace.com.eg
  # Copyright:: 2008

  # Leading part of the word
  attr_reader :prefix
  # Middle part of the word
  attr_reader :stem
  # Trailing part of the word
  attr_reader :suffix

  # * [prefix] leading part
  # * [stem] middle part
  # * [suffix] trailing part
  def initialize(prefix, stem, suffix)
    @prefix, @stem, @suffix = prefix, stem, suffix
  end
end