espace-raramorph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,417 @@
1
+ # A Ruby port of Buckwalter Morphological Analyzer Version 1.0.
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
+
7
# Morphological analyzer (a Ruby port of the Buckwalter Morphological
# Analyzer). Everything is exposed as class methods.
#
# Per-run state (logger, counters and the found / not-found tallies) is kept
# in class-level instance variables that are reset by Raramorph.execute, so
# the other methods expect execute to have been called first.
class Raramorph

  # The dictionary handler.
  @@dict = InMemoryDictionaryHandler.create
  # The solutions handler.
  @@sol = InMemorySolutionsHandler.create
  # Word endings (in transliterated form) that trigger the alternative
  # spelling rules in feed_alternative_spellings, in order of precedence.
  # NOTE(review): the last pattern ("p$") is never consulted by this class.
  @@alternative_spellings = [Regexp.compile(".*" + "Y'$"),
                             Regexp.compile(".*" + "y'$"),
                             Regexp.compile(".*" + "y$"),
                             Regexp.compile(".*" + "h$"),
                             Regexp.compile(".*" + "p$")]
  @@space_regex = Regexp.compile("\\s+")

  # * Analyze and process the file (i.e. run the morphological analysis on
  #   every token of every line)
  # * [file_reader_in] Input file path
  # * [output_buckwalter] whether solutions are logged in Buckwalter
  #   transliteration (Roman letters) or in Arabic letters
  def self.analyze(file_reader_in, output_buckwalter)
    # File.readlines (rather than IO.readlines) so that a path starting with
    # "|" is treated as a file name and never spawned as a subprocess.
    File.readlines(file_reader_in).each do |line|
      @lines_counter += 1
      puts "Processing line : #{@lines_counter.to_s}" if @logger.verbose
      tokenize(line).each { |token| analyze_token(token, output_buckwalter) }
    end
  end

  # * Tokenize a line, splitting it on every run of characters that is not
  #   in the accepted Arabic character class
  # * [str] Text to be tokenized
  # * Returns an Array of Strings; never empty (the cleaned-up input itself is
  #   returned as the single token when the split yields nothing). The split
  #   can yield empty strings, e.g. when the text starts with a non-Arabic
  #   character.
  def self.tokenize(str)
    # Work on a copy: force_encoding changes its receiver in place, which
    # would alter (or fail on, if frozen) the caller's string.
    text = str.dup.force_encoding("UTF-8").strip.gsub(@@space_regex, " ")
    # Deliberately not part of the character class below:
    #   \u0688 DDAL, \u06A9 KEHEH, \u0691 RREH, \u06BA NOON GHUNNA,
    #   \u06BE HEH DOACHASHMEE, \u06C1 HEH GOAL, \u06D2 YEH BARREE
    pieces = text.split(/[^\u067E\u0686\u0698\u06AF\u0621-\u0636\u0637-\u0643\u0644\u0645-\u0648\u0649-\u064A\u064B-\u064E\u064F\u0650\u0651\u0652]+/)
    # Return at least one token, the string itself if necessary.
    pieces.empty? ? [text] : pieces
  end

  # * Analyze one token: classify it as Arabic or not, look up (or compute)
  #   its solutions, update the counters and tallies, and log the result
  # * [token] word to be analyzed
  # * [output_buckwalter] whether solutions are logged in Buckwalter
  #   transliteration (Roman letters) or in Arabic letters
  # * Returns true when the token (or one of its alternative spellings) has
  #   at least one solution, false otherwise (including non-Arabic tokens)
  # * Raises RuntimeError when the found / not-found bookkeeping is
  #   inconsistent
  def self.analyze_token(token, output_buckwalter)
    # Copy before re-tagging the encoding so the caller's string is untouched.
    token = token.dup.force_encoding("UTF-8")
    @logger.info "Processing token : " + "\t" + token
    # Deliberately not part of the character class below:
    #   \u0688 DDAL, \u06A9 KEHEH, \u0691 RREH, \u06BA NOON GHUNNA,
    #   \u06BE HEH DOACHASHMEE, \u06C1 HEH GOAL, \u0640 TATWEEL,
    #   \u06D2 YEH BARREE
    if token !~ /([\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652])+/
      record_non_arabic(token)
      return false
    end

    # Arabic tokens get their own counter; they used to be added to the
    # non-Arabic one, which made both statistics wrong.
    @arabic_tokens_counter += 1

    translitered = ArabicLatinTranslator.translate(token)
    @logger.info("Transliteration : \t#{translitered}")

    has_solutions = resolve_token(translitered)

    # Output solutions. TODO: consider XML output.
    log_token_solutions(translitered, output_buckwalter) unless @logger.output.nil?
    has_solutions
  end

  # * Compute and store the solutions of a transliterated word
  # * [translitered] word to be processed
  # * Returns true when the solutions handler holds at least one solution for
  #   the word once this method returns
  def self.feed_word_solutions(translitered)
    return true if @@sol.has_solutions(translitered) # No need to reprocess

    word_solutions = Set.new
    count = 0
    # Brute force: try every valid segmentation against the dictionaries.
    segment_word(translitered).each do |segmented_word|
      count = @@dict.analyze_word_in_dictionaries(segmented_word, word_solutions, @logger.verbose, count)
    end

    # Add all solutions, if any.
    @@sol.add_solutions(translitered, word_solutions) unless word_solutions.empty?
    !word_solutions.empty?
  end

  # * Return the solutions already computed for the given word, including
  #   those of its alternative spellings
  # * [word] word (in Arabic script) to look up
  # * Returns a Set, empty when the word was analyzed without success
  # * Raises RuntimeError when the word has not been analyzed yet
  def self.get_word_solutions(word)
    # Copy before re-tagging the encoding so frozen / shared strings are safe.
    translitered = ArabicLatinTranslator.translate(word.dup.force_encoding("UTF-8"))
    word_solutions = Set.new

    if @found.has_key?(translitered)
      spellings = [translitered]
      if @@sol.has_alternative_spellings(translitered)
        @@sol.get_alternative_spellings(translitered).each { |alt| spellings << alt }
      end
      spellings.each do |spelling|
        next unless @@sol.has_solutions(spelling)
        @@sol.get_solutions(spelling).each { |solution| word_solutions << solution }
      end
    elsif !@not_found.has_key?(translitered)
      raise "#{translitered} is neither in found or notFound !"
    end

    word_solutions
  end

  # * Segment the given word into every candidate prefix / stem / suffix split
  # * [translitered] word to be processed
  # * Returns a Set of SegmentedWord. Prefixes are 0 to 4 characters long,
  #   suffixes 0 to 6, and the stem always keeps at least one character.
  def self.segment_word(translitered)
    segmented = Set.new
    word_len = translitered.length

    (0..[4, word_len].min).each do |prefix_len|
      prefix = translitered.slice(0, prefix_len)
      remaining = word_len - prefix_len

      (0..6).each do |suffix_len|
        stem_len = remaining - suffix_len
        break if stem_len < 1 # nothing left for the stem

        stem = translitered.slice(prefix_len, stem_len)
        suffix = translitered.slice(prefix_len + stem_len, suffix_len)
        segmented.add(SegmentedWord.new(prefix, stem, suffix))
      end
    end

    segmented
  end

  # Print the counters of the current run to standard output.
  def self.print_stats
    total = (@found.length + @not_found.length).to_f
    puts "=================== Statistics ==================="
    puts "Lines : " + @lines_counter.to_s
    puts "Arabic tokens : " + @arabic_tokens_counter.to_s
    puts "Non-arabic tokens : " + @not_arabic_tokens_counter.to_s
    puts "Words found : " + @found.length.to_s + " (" + percentage_of(@found.length, total).to_s + "%)"
    puts "Words not found : " + @not_found.length.to_s + " (" + percentage_of(@not_found.length, total).to_s + "%)"
    puts "=================================================="
  end

  # * Derive and store the alternative spellings of a transliterated word
  # * [translitered] word to be processed; it is never modified
  # * Returns true when the solutions handler holds at least one alternative
  #   spelling for the word once this method returns
  def self.feed_alternative_spellings(translitered)
    return true if @@sol.has_alternative_spellings(translitered)

    alternatives = Set.new
    # Only non-destructive String methods are used below. The word is the
    # lookup key of the caller's tallies and of the solutions handler, so
    # editing it in place would corrupt that bookkeeping.
    lowered = translitered.gsub(/Y/, "y")

    if translitered =~ @@alternative_spellings[0]
      # NOTE(review): this branch substitutes a bare "w" where every other
      # branch substitutes "w'" -- confirm which one is intended.
      temp = record_alternative(alternatives, lowered, translitered)
      record_if_changed(alternatives, temp, temp.sub(/w/, "&"), translitered)
      temp = record_alternative(alternatives, lowered.sub(/y'$/, "}"), translitered)
      record_if_changed(alternatives, temp, temp.sub(/w/, "&"), translitered)

    elsif translitered =~ @@alternative_spellings[1]
      temp = record_if_changed(alternatives, translitered, lowered, translitered)
      record_if_changed(alternatives, temp, temp.sub(/w'/, "&"), translitered)
      temp = record_alternative(alternatives, lowered.sub(/y'$/, "}"), translitered)
      record_if_changed(alternatives, temp, temp.sub(/w'/, "&"), translitered)

    elsif translitered =~ @@alternative_spellings[2]
      record_if_changed(alternatives, lowered, lowered.sub(/w'/, "&"), translitered)
      temp = record_alternative(alternatives, lowered.gsub(/y$/, "Y"), translitered)
      record_if_changed(alternatives, temp, temp.sub(/w'/, "&"), translitered)

    elsif translitered =~ @@alternative_spellings[3]
      temp = record_if_changed(alternatives, translitered, lowered, translitered)
      temp = record_if_changed(alternatives, temp, temp.sub(/w'/, "&"), translitered)
      # The word ends in "h" here, so the previous /p$/ -> "h" substitution
      # could never match and the word itself was recorded as an alternative.
      # Swap the final "h" for "p" instead.
      # NOTE(review): direction inferred from the branch condition -- confirm.
      record_alternative(alternatives, temp.sub(/h$/, "p"), translitered)

    else
      # No special ending: record the all-lowercase-y spelling when it
      # differs from the word, then the "w'" -> "&" variant of the result.
      temp = record_if_changed(alternatives, translitered, lowered, translitered)
      record_if_changed(alternatives, temp, temp.sub(/w'/, "&"), translitered)
    end

    @@sol.add_alternative_spellings(translitered, alternatives) unless alternatives.empty?
    !alternatives.empty?
  end

  # Executes the morphological analyzer after resetting the per-run state.
  # * [input_filename] input file path
  # * [output_filename] output file path, handed to the logger
  # * [verbose] currently unused (see the note in the body)
  # * [not_arabic] passed to analyze as output_buckwalter: whether solutions
  #   are logged in Buckwalter transliteration instead of Arabic letters
  def self.execute(input_filename, output_filename, verbose = false, not_arabic = false)
    # NOTE(review): +verbose+ is accepted but never used; the logger is always
    # built with a literal true. Presumably that first argument is the verbose
    # flag -- confirm against the project's Logger before wiring it through.
    @logger = Logger.new(true, output_filename)
    @not_arabic = not_arabic
    # Lines processed
    @lines_counter = 0
    # Arabic tokens processed
    @arabic_tokens_counter = 0
    # Non-Arabic tokens processed
    @not_arabic_tokens_counter = 0
    # Arabic words which have been successfully analyzed.
    # * [key] = transliterated word
    # * [value] = occurrences
    @found = {}
    # Arabic words which have not been successfully analyzed.
    # * [key] = transliterated word
    # * [value] = occurrences
    @not_found = {}
    analyze(input_filename, @not_arabic)
    @logger.log
    print_stats
  end

  # Counts and logs every whitespace-separated piece of a non-Arabic token.
  def self.record_non_arabic(token)
    token.strip.split(@@space_regex).each do |sub_token|
      next if sub_token.strip == ""
      @not_arabic_tokens_counter += 1
      @logger.info("Non-Arabic : #{sub_token}")
    end
  end

  # Updates the found / not-found tallies for a transliterated word, computing
  # its solutions on first sight. Returns whether the word has solutions.
  def self.resolve_token(translitered)
    if @found.has_key?(translitered) # Already processed: previously found
      @logger.info("Token already processed.", true)
      @found[translitered] += 1 # increase reference counter
      return true
    end

    if @not_found.has_key?(translitered) # Already processed: previously not found
      @logger.info("Token already processed without solution.", true)
      @not_found[translitered] += 1 # increase reference counter
      return false
    end

    @logger.info("Token not yet processed.", true)

    if feed_word_solutions(translitered)
      mark_found(translitered, "Token has direct solutions.")
    elsif feed_alternative_spellings(translitered)
      alternatives_give_solutions = false
      # Feed every alternative, even after one has succeeded. Short-circuiting
      # left later alternatives without solutions, so the result depended on
      # the iteration order of the set.
      @@sol.get_alternative_spellings(translitered).each do |alternative|
        alternatives_give_solutions = true if feed_word_solutions(alternative)
      end

      if alternatives_give_solutions
        mark_found(translitered, "Token's alternative spellings have solutions.")
      else
        mark_not_found(translitered, "Token's alternative spellings have no solution.")
      end
    else
      mark_not_found(translitered, "Token has no solution and no alternative spellings.")
    end
  end

  # Registers a first occurrence of a word with solutions. Returns true.
  def self.mark_found(translitered, message)
    # consistency check
    raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
    @logger.info(message, true)
    @found[translitered] = 1 # set reference counter to 1
    true
  end

  # Registers a first occurrence of a word without solutions. Returns false.
  def self.mark_not_found(translitered, message)
    # consistency check
    raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
    @logger.info(message, true)
    @not_found[translitered] = 1 # set reference counter to 1
    false
  end

  # Logs the solutions of an already tallied word and of its alternatives.
  def self.log_token_solutions(translitered, output_buckwalter)
    if @found.has_key?(translitered)
      log_solutions_of(translitered, output_buckwalter)
      if @@sol.has_alternative_spellings(translitered)
        @logger.info("No direct solution", true)
        @@sol.get_alternative_spellings(translitered).each do |alternative|
          @logger.info("Considering alternative spelling :" + "\t#{alternative}", true)
          log_solutions_of(alternative, output_buckwalter)
        end
      end
    elsif @not_found.has_key?(translitered)
      @logger.info "\nNo solution\n"
    else
      raise "#{translitered} is neither in found or notFound !"
    end
  end

  # Logs each stored solution of one spelling, if it has any.
  def self.log_solutions_of(spelling, output_buckwalter)
    return unless @@sol.has_solutions(spelling)
    @@sol.get_solutions(spelling).each do |solution|
      @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
    end
  end

  # Logs and stores one alternative spelling. Returns the spelling.
  def self.record_alternative(alternatives, candidate, translitered)
    @logger.info "Found alternative spelling #{candidate} for word #{translitered}", true
    alternatives.add(candidate)
    candidate
  end

  # Records +candidate+ only when it differs from +current+. Returns the
  # spelling that later rules should keep working from.
  def self.record_if_changed(alternatives, current, candidate, translitered)
    return current if candidate == current
    record_alternative(alternatives, candidate, translitered)
  end

  # Percentage of +count+ in +total+, rounded to two decimals. Returns 0.0 for
  # an empty total instead of rounding NaN, which raises FloatDomainError.
  def self.percentage_of(count, total)
    return 0.0 if total.zero?
    (100 * (count * 100 / total)).round / 100.0
  end

  private_class_method :record_non_arabic, :resolve_token, :mark_found,
                       :mark_not_found, :log_token_solutions, :log_solutions_of,
                       :record_alternative, :record_if_changed, :percentage_of
end
415
+
416
# One candidate segmentation of a transliterated word: the +prefix+, +stem+
# and +suffix+ substrings, in that order.
class SegmentedWord < Struct.new(:prefix, :stem, :suffix)
end
417
+