raramorph 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,56 @@
1
+ == Raramorph
2
+ By eSpace-technologies
3
+ http://www.espace.com.eg
4
+ http://www.espace.com.eg/docs/raramorph/index.html
5
+ http://github.com/espace/raramorph
6
+ http://raramorph.rubyforge.org
7
+
8
+ == DESCRIPTION:
9
+
10
+ Raramorph is a Ruby 1.9 gem: an intelligent port of Aramorph, which is based on the Buckwalter Arabic Morphological Analyzer Version 1.0.
11
+
12
+ == Usage
13
+
14
+ require 'raramorph'
15
+ # For analyzing a file
16
+ Raramorph.execute(input_filename, output_filename, verbose = false, not_arabic = false)
17
+ # You can also call analyze_token, tokenize and segment_word directly as class (static) methods of the Raramorph class
18
+ OR
19
+ From the command line
20
+ raramorph input_file_name output_file_name -v -a
21
+ -v verbose mode ( optional )
22
+ -a arabic output ( optional )
23
+
24
+
25
+
26
+ == INSTALL:
27
+
28
+ sudo gem install raramorph
29
+
30
+ == SOURCE CODE:
31
+ http://github.com/espace/raramorph/tree/master
32
+
33
+ == LICENSE:
34
+
35
+ (The MIT License)
36
+
37
+ Copyright (c) 2008 Moustafa Emara , Hany Salah el deen
38
+
39
+ Permission is hereby granted, free of charge, to any person obtaining
40
+ a copy of this software and associated documentation files (the
41
+ 'Software'), to deal in the Software without restriction, including
42
+ without limitation the rights to use, copy, modify, merge, publish,
43
+ distribute, sublicense, and/or sell copies of the Software, and to
44
+ permit persons to whom the Software is furnished to do so, subject to
45
+ the following conditions:
46
+
47
+ The above copyright notice and this permission notice shall be
48
+ included in all copies or substantial portions of the Software.
49
+
50
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
51
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
52
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
53
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
54
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
55
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
56
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -2,9 +2,7 @@
2
2
  # Author:: eSpace technologies www.eSpace.com.eg
3
3
  # Copyright:: 2008
4
4
  #
5
-
6
5
  class ArabicLatinTranslator
7
-
8
6
  # * Table Used for Tranlation From Arabic To English I.e ( Romanize Word )
9
7
  # * According to Buckwalter system Dictionary
10
8
  TABLE = { "\u0621"=> "'" , "\u0622"=> "|" , "\u0623"=> ">" , "\u0624"=> "&" , "\u0625"=> "<" , "\u0626"=> "}" ,
@@ -20,9 +18,7 @@ class ArabicLatinTranslator
20
18
  #Not suitable for morphological analysis : remove all vowels/diacritics, i.e. undo the job !
21
19
  VOWEL_REMOVER = Regexp.compile("[FNKaui~o]")
22
20
  STRIPER = Regexp.compile("[`\\{]")
23
- def initilaize
24
-
25
- end
21
+
26
22
 
27
23
  # * Translate : Transilerate the arabic word to Roman lettered Word
28
24
  # * [word] Word String To be processed
@@ -1,7 +1,7 @@
1
1
  # Class For Storing Dictionary Entries
2
2
  # Author:: eSpace technologies www.eSpace.com.eg
3
3
  # Copyright:: 2008
4
- #
4
+
5
5
 
6
6
  class DictionaryEntry
7
7
  ## Constructs a Dictionary Entry
@@ -1,7 +1,7 @@
1
1
  # Class For Storing And Loading Dictionaries
2
2
  # Author:: eSpace technologies www.eSpace.com.eg
3
3
  # Copyright:: 2008
4
- #
4
+
5
5
 
6
6
  require 'rubygems'
7
7
  class InMemoryDictionaryHandler
@@ -25,25 +25,24 @@ class InMemoryDictionaryHandler
25
25
  ### Variables #####
26
26
  @@handler = nil
27
27
  @@regex = Regexp.compile(".*" + "<pos>(.+?)</pos>" + ".*")
28
- @@morphology_regexs=[]
29
- #@@leema_starter = Regexp.compile(";; ")
30
- @@morphology_regexs[0] = Regexp.compile("^(Pref-0|Suff-0)$")
31
- @@morphology_regexs[1] = Regexp.compile("^F" + ".*")
32
- @@morphology_regexs[2] = Regexp.compile("^IV" + ".*")
33
- @@morphology_regexs[3] = Regexp.compile("^PV" + ".*")
34
- @@morphology_regexs[4] = Regexp.compile("^CV" + ".*")
35
- @@morphology_regexs[5] = Regexp.compile("^N" + ".*")
36
- @@morphology_regexs[6] = Regexp.compile("^[A-Z]" + ".*")
37
- @@morphology_regexs[7] = Regexp.compile(".*" + "iy~$")
28
+ @@morphology_regexs=[Regexp.compile("^(Pref-0|Suff-0)$") ,
29
+ Regexp.compile("^F" + ".*") ,
30
+ Regexp.compile("^IV" + ".*") ,
31
+ Regexp.compile("^PV" + ".*") ,
32
+ Regexp.compile("^CV" + ".*") ,
33
+ Regexp.compile("^N" + ".*") ,
34
+ Regexp.compile("^[A-Z]" + ".*") ,
35
+ Regexp.compile(".*" + "iy~$")
36
+ ]
38
37
  @@compatability_stpliter = Regexp.compile("\\s+")
39
- @@vocalization_array =[]
40
- @@vocalization_array[0] = "/FUNC_WORD"
41
- @@vocalization_array[1] ="/VERB_IMPERFECT"
42
- @@vocalization_array[2] ="/VERB_PERFECT"
43
- @@vocalization_array[3] ="/VERB_IMPERATIVE"
44
- @@vocalization_array[4] = "/NOUN_PROP"
45
- @@vocalization_array[5] ="/NOUN"
46
- @@vocalization_array[6] = "/NOUN"
38
+ @@vocalization_array =["/FUNC_WORD" ,
39
+ "/VERB_IMPERFECT" ,
40
+ "/VERB_PERFECT" ,
41
+ "/VERB_IMPERATIVE" ,
42
+ "/NOUN_PROP" ,
43
+ "/NOUN" ,
44
+ "/NOUN"
45
+ ]
47
46
 
48
47
  @@prefixes_stems_compatibility = Set.new
49
48
  #Changed
@@ -163,6 +162,44 @@ class InMemoryDictionaryHandler
163
162
  @@suffixes = suffixes
164
163
  end
165
164
 
165
+ def analyze_word_in_dictionaries(segmented_word , word_solutions , verbose , count)
166
+ #Is prefix known ?
167
+ if has_prefix?(segmented_word.prefix)
168
+ #Is stem known ?
169
+ # puts "has prefix"
170
+ if has_stem?(segmented_word.stem)
171
+ # puts "has stem"
172
+ #Is suffix known ?
173
+ if has_suffix?(segmented_word.suffix)
174
+ # puts "has suffix"
175
+ #Compatibility check
176
+ @@prefixes[segmented_word.prefix].each{|prefix|
177
+ @@stems[segmented_word.stem].each {|stem|
178
+ #Prefix/Stem compatibility
179
+ if prefixes_stems_compatible?(prefix.morphology ,stem.morphology )
180
+ # puts "has A B Com"
181
+ @@suffixes[segmented_word.suffix].each {|suffix|
182
+ # Prefix/Suffix compatiblity
183
+ if prefixes_suffixes_compatible?(prefix.morphology , suffix.morphology)
184
+ # puts "has A C Com"
185
+ # Stems/Suffixes compatiblity
186
+ if stems_suffixes_compatible?(stem.morphology , suffix.morphology)
187
+ # puts "has B C COM"
188
+ #All tests passed : it is a solution
189
+ count = count + 1
190
+ word_solutions << Solution.new(verbose , count , prefix , stem , suffix )
191
+ end
192
+ end
193
+ }
194
+ end
195
+ }
196
+ }
197
+ end
198
+ end
199
+ end
200
+ return count
201
+ end
202
+
166
203
  private
167
204
 
168
205
  # * load Dictionary from files
@@ -241,7 +278,8 @@ class InMemoryDictionaryHandler
241
278
  vocalization = splited_line[1]
242
279
  morphology = splited_line[2]
243
280
  gloss_pos = splited_line[3]
244
- gloss , pos = ""
281
+ gloss = ""
282
+ pos = ""
245
283
  # two ways to get the POS info
246
284
  # (1) explicitly, by extracting it from the gloss field:
247
285
 
@@ -2,7 +2,7 @@
2
2
  #
3
3
  # Author:: eSpace technologies www.eSpace.com.eg
4
4
  # Copyright:: 2008
5
- #
5
+
6
6
 
7
7
  class InMemorySolutionsHandler
8
8
 
@@ -1,7 +1,7 @@
1
1
  # Class For Latin Arabic Transileration
2
2
  # Author:: eSpace technologies www.eSpace.com.eg
3
3
  # Copyright:: 2008
4
- #
4
+
5
5
 
6
6
  class LatinArabicTranslator
7
7
 
@@ -0,0 +1,20 @@
1
+ class Logger
2
+
3
+
4
+ attr_reader :verbose , :output
5
+ def initialize(verbose = nil , output = nil )
6
+ @verbose = verbose
7
+ @output = output
8
+ @stream = StringIO.new
9
+ end
10
+
11
+ def info string , require_verbose = false
12
+ @stream.puts(string) #if ( require_verbose && @verbose || ! require_verbose )
13
+ end
14
+
15
+ def log
16
+ return puts @stream.string if @output.nil?
17
+ File.open(@output , "w") { |f|
18
+ f.puts @stream.string }
19
+ end
20
+ end
@@ -1,10 +1,8 @@
1
- # A Ruby port of Buckwalter Arabic Morphological Analyzer Version 1.0.
2
- #
1
+ # A Ruby port of Buckwalter Morphological Analyzer Version 1.0.
3
2
  # Author:: eSpace technologies www.eSpace.com.eg
4
3
  # Copyright:: 2008
5
4
 
6
5
 
7
- require 'set'
8
6
 
9
7
  class Raramorph
10
8
 
@@ -13,48 +11,14 @@ class Raramorph
13
11
  # The solutions handler.
14
12
  @@sol = InMemorySolutionsHandler.create
15
13
  # Whether or not the analyzer should output some convenience messages
16
- @verbose
17
- # The stream where to output the results
18
-
19
- @output_stream
20
-
21
- #use arabic translation or not?
22
- @not_arabic
23
-
24
- #Stats
25
- # Lines processed
26
- @lines_counter = 0
27
- # Arabic tokens processed
28
- @not_arabic_tokens_counter = 0
29
- # Not arabic tokens processed
30
- @not_arabic_tokens_counter = 0
31
-
32
- # Arabic words which have been succesfully analyzed.
33
- # * [key] = word
34
- # * [value] = occurences
35
- #
36
- @found = {}
37
-
38
- # Arabic words which have not been succesfully analyzed.
39
- # * [key] = word
40
- # * [value] = occurences
41
- #
42
- @not_found = {}
43
-
44
14
  # Alternative spellings list of regular expressions
45
- @@alternative_spellings = []
46
- @@alternative_spellings[0] = Regexp.compile(".*" + "Y'$")
47
- @@alternative_spellings[1] = Regexp.compile(".*" + "y'$")
48
- @@alternative_spellings[2] = Regexp.compile(".*" + "y$")
49
- @@alternative_spellings[3] = Regexp.compile(".*" + "h$")
50
- @@alternative_spellings[4] = Regexp.compile(".*" + "p$")
15
+ @@alternative_spellings = [Regexp.compile(".*" + "Y'$") ,
16
+ Regexp.compile(".*" + "y'$") ,
17
+ Regexp.compile(".*" + "y$") ,
18
+ Regexp.compile(".*" + "h$") ,
19
+ Regexp.compile(".*" + "p$") ]
51
20
  @@space_regex = Regexp.compile("\\s+")
52
21
 
53
-
54
- def self.set_verbose(verbose) #Bolean Variable
55
- @verbose = verbose
56
- end
57
-
58
22
  # * Analyze and Process the file ( i.e Doing the morphological Analysis )
59
23
  # * [file_reader_in] Input File Path
60
24
  # * [output_buckwalter] whether the output in buckwalter indications ( i.e Roman letters ) or arabic letters
@@ -63,8 +27,8 @@ class Raramorph
63
27
  lines= IO.readlines(file_reader_in)
64
28
  lines.each do |line|
65
29
  @lines_counter+=1
66
- if(@verbose)
67
- puts "Processing line : "+ @lines_counter.to_s
30
+ if(@logger.verbose)
31
+ puts "Processing line : #{@lines_counter.to_s}"
68
32
  end
69
33
  tokens = tokenize(line)
70
34
  tokens.each do |token|
@@ -72,7 +36,7 @@ class Raramorph
72
36
  end
73
37
  end
74
38
  #rescue
75
- # @stream.puts "Can not read line " + @lines_counter.to_s
39
+ # @logger.info "Can not read line " + @lines_counter.to_s
76
40
  #end
77
41
  end
78
42
 
@@ -102,7 +66,7 @@ class Raramorph
102
66
  def self.analyze_token(token , output_buckwalter) #STring , Boolean , REturn Boolean
103
67
  #TO DO SET UP THE PRINT STREAM
104
68
  token.force_encoding "UTF-8"
105
- @stream.puts "Processing token : " + "\t" + token
69
+ @logger.info "Processing token : " + "\t" + token
106
70
  #TODO : check accuracy
107
71
  #ignored \u0688 : ARABIC LETTER DDAL
108
72
  #ignored \u06A9 : ARABIC LETTER KEHEH
@@ -119,7 +83,7 @@ class Raramorph
119
83
  sub_tokens.each{|sub_token|
120
84
  unless sub_token.strip == ""
121
85
  @not_arabic_tokens_counter+=1
122
- @output_stream != nil ? @stream.puts("Non-Arabic : " + sub_token) : puts("Non-Arabic : " + sub_token)
86
+ @logger.info("Non-Arabic : #{sub_token}")
123
87
  end
124
88
  }
125
89
  return false
@@ -128,31 +92,30 @@ class Raramorph
128
92
  @not_arabic_tokens_counter+=1
129
93
 
130
94
  translitered = ArabicLatinTranslator.translate(token)
131
- @output_stream != nil ? @stream.puts("Transliteration : " + "\t" + translitered) : puts("Transliteration : " + "\t" + translitered)
95
+ @logger.info("Transliteration : \t#{translitered}")
132
96
 
133
97
  if @found.has_key?(translitered) #Already processed : previously found
134
- @output_stream != nil && @verbose ? @stream.puts("Token already processed.") : puts("Token already processed.")
98
+ @logger.info("Token already processed." , true )
135
99
  #increase reference counter
136
100
  @found[translitered]+=1
137
101
  has_solutions = true
138
102
  elsif @not_found.has_key?(translitered) #Already processed : previously not found
139
- @output_stream != nil && @verbose ? @stream.puts("Token already processed without solution.") : puts("Token already processed without solution.")
103
+ @logger.info("Token already processed without solution." , true )
140
104
  @not_found[translitered]+=1 #increase reference counter
141
105
  has_solutions = false
142
106
  else
143
- @output_stream != nil && @verbose ? @stream.puts("Token not yet processed.") : puts("Token not yet processed.")
107
+ @logger.info("Token not yet processed.", true )
144
108
 
145
109
  if (feed_word_solutions(translitered)) #CHANGED #word has solutions...
146
110
  #mark word as found
147
111
  raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
148
- @output_stream != nil && @verbose ? @stream.puts("Token has direct solutions.") : puts("Token has direct solutions.")
112
+ @logger.info("Token has direct solutions." , true )
149
113
  #set reference counter to 1
150
114
  @found[translitered] = 1
151
115
  has_solutions = true
152
116
  else #word has no direct solution
153
117
  if(feed_alternative_spellings(translitered))
154
118
  alternatives_give_solutions = false
155
-
156
119
  alternatives = @@sol.get_alternative_spellings(translitered)
157
120
  alternatives.each{|alternative|
158
121
  alternatives_give_solutions = (alternatives_give_solutions || feed_word_solutions(alternative))
@@ -160,21 +123,21 @@ class Raramorph
160
123
  if(alternatives_give_solutions)
161
124
  #consistency check
162
125
  raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
163
- @output_stream != nil && @verbose ? @stream.puts("Token's alternative spellings have solutions.") : puts("Token's alternative spellings have solutions.")
126
+ @logger.info("Token's alternative spellings have solutions." , true )
164
127
  #mark word as found set reference counter to 1
165
128
  @found[translitered] = 1
166
129
  has_solutions = true
167
130
  else
168
131
  #consistency check
169
132
  raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
170
- @output_stream != nil && @verbose ? @stream.puts("Token's alternative spellings have no solution.") : puts("Token's alternative spellings have no solution.")
133
+ @logger.info("Token's alternative spellings have no solution." , true )
171
134
  @not_found[translitered]=1
172
135
  has_solutions = false
173
136
  end
174
137
  else
175
138
  #there are no alternative
176
139
  raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
177
- @output_stream != nil && @verbose ? @stream.puts("Token has no solution and no alternative spellings.") : puts("Token has no solution and no alternative spellings.")
140
+ @logger.info("Token has no solution and no alternative spellings." , true )
178
141
  #mark word as not found and set reference counter to 1
179
142
  @not_found[translitered]=1
180
143
  has_solutions = false
@@ -184,22 +147,24 @@ class Raramorph
184
147
 
185
148
 
186
149
  #output solutions : TODO consider XML output
187
- if @output_stream != nil
150
+ if @logger.output != nil
188
151
  if @found.has_key?(translitered)
189
152
  if @@sol.has_solutions(translitered)
190
- @@sol.get_solutions(translitered).each{|solution| @stream.puts "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"}
153
+ @@sol.get_solutions(translitered).each{|solution| @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
154
+ }
191
155
  end
192
156
  if @@sol.has_alternative_spellings(translitered)
193
- @output_stream != nil && @verbose ? @stream.puts("No direct solution") : puts("No direct solution")
157
+ @logger.info("No direct solution" , true )
194
158
  @@sol.get_alternative_spellings(translitered).each{|alternative|
195
- @output_stream != nil && @verbose ? @stream.puts("Considering alternative spelling :" + "\t" + alternative) : puts("Considering alternative spelling :" + "\t" + alternative)
159
+ @logger.info("Considering alternative spelling :" + "\t#{alternative}" , true )
196
160
  if @@sol.has_solutions(alternative)
197
- @@sol.get_solutions(alternative).each{|solution| @stream.puts "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"}
161
+ @@sol.get_solutions(alternative).each{|solution| @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
162
+ }
198
163
  end
199
164
  }
200
165
  end
201
166
  elsif @not_found.has_key?(translitered)
202
- @stream.puts "\nNo solution\n"
167
+ @logger.info "\nNo solution\n"
203
168
  else
204
169
  raise "#{translitered} is neither in found or notFound !"
205
170
  end
@@ -220,40 +185,7 @@ class Raramorph
220
185
  segments = segment_word(translitered) #Hash Set of Segement Words Objects
221
186
  #Brute force algorithm
222
187
  segments.each{|segmented_word|
223
- #Is prefix known ?
224
- if @@dict.has_prefix?(segmented_word.prefix)
225
- #Is stem known ?
226
- # puts "has prefix"
227
- if @@dict.has_stem?(segmented_word.stem)
228
- # puts "has stem"
229
- #Is suffix known ?
230
- if @@dict.has_suffix?(segmented_word.suffix)
231
- # puts "has suffix"
232
- #Compatibility check
233
- @@dict.prefixes[segmented_word.prefix].each{|prefix|
234
- @@dict.stems[segmented_word.stem].each {|stem|
235
- #Prefix/Stem compatibility
236
- if @@dict.prefixes_stems_compatible?(prefix.morphology ,stem.morphology )
237
- # puts "has A B Com"
238
- @@dict.suffixes[segmented_word.suffix].each {|suffix|
239
- # Prefix/Suffix compatiblity
240
- if @@dict.prefixes_suffixes_compatible?(prefix.morphology , suffix.morphology)
241
- # puts "has A C Com"
242
- # Stems/Suffixes compatiblity
243
- if @@dict.stems_suffixes_compatible?(stem.morphology , suffix.morphology)
244
- # puts "has B C COM"
245
- #All tests passed : it is a solution
246
- count = count + 1
247
- word_solutions << Solution.new(@verbose , count , prefix , stem , suffix )
248
- end
249
- end
250
- }
251
- end
252
- }
253
- }
254
- end
255
- end
256
- end
188
+ count = @@dict.analyze_word_in_dictionaries(segmented_word , word_solutions , @logger.verbose , count )
257
189
  }
258
190
 
259
191
  #Add all solutions, if any
@@ -322,37 +254,29 @@ class Raramorph
322
254
  # * Find Alternative Spellings for the translitered word
323
255
  # * [translitered] word to be proccesed
324
256
  def self.feed_alternative_spellings(translitered)
325
- return true if(@@sol.has_alternative_spellings(translitered))
257
+ return true if(@@sol.has_alternative_spellings(translitered))
326
258
  word_alternative_spellings = Set.new
327
259
  temp = translitered
328
260
 
329
261
  if( temp.match(@@alternative_spellings[0]) )
330
262
  temp.gsub!(/Y/, "y")
331
- if(@verbose)
332
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
333
- end
263
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
334
264
  word_alternative_spellings.add(temp)
335
265
  temp2 = temp.sub(/w/, "&")
336
266
  if(temp!=temp2)
337
267
  temp = temp2
338
- if(@verbose)
339
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
340
- end
268
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
341
269
  word_alternative_spellings.add(temp)
342
270
  end
343
271
  temp=translitered
344
272
  temp.gsub!(/Y/,"y")
345
273
  temp.sub!(/y'$/,"}")
346
- if(@verbose)
347
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
348
- end
274
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
349
275
  word_alternative_spellings.add(temp)
350
276
  temp2 = temp.sub(/w/, "&")
351
277
  if(temp!=temp2)
352
278
  temp = temp2
353
- if(@verbose)
354
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
355
- end
279
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
356
280
  word_alternative_spellings.add(temp)
357
281
  end
358
282
 
@@ -360,32 +284,24 @@ class Raramorph
360
284
  temp2 = temp.gsub(/Y/,"y")
361
285
  if(temp != temp2 )
362
286
  temp = temp2
363
- if(@verbose)
364
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
365
- end
287
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
366
288
  word_alternative_spellings.add(temp)
367
289
  end
368
290
  temp2 = temp.sub(/w'/, "&")
369
291
  if(temp != temp2 )
370
292
  temp = temp2
371
- if(@verbose)
372
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
373
- end
293
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
374
294
  word_alternative_spellings.add(temp)
375
295
  end
376
296
  temp =translitered
377
297
  temp.gsub!(/Y/, "y")
378
298
  temp.sub!(/y'$/, "}")
379
- if(@verbose)
380
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
381
- end
299
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
382
300
  word_alternative_spellings.add(temp)
383
301
  temp2 = temp.sub(/w'/, "&")
384
302
  if(temp != temp2 )
385
303
  temp = temp2
386
- if(@verbose)
387
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
388
- end
304
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
389
305
  word_alternative_spellings.add(temp)
390
306
  end
391
307
 
@@ -394,24 +310,18 @@ class Raramorph
394
310
  temp2 = temp.sub(/w'/, "&")
395
311
  if(temp != temp2 )
396
312
  temp = temp2
397
- if(@verbose)
398
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
399
- end
313
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
400
314
  word_alternative_spellings.add(temp)
401
315
  end
402
316
  temp =translitered
403
317
  temp.gsub!(/Y/, "y")
404
318
  temp.gsub!(/y$/, "Y")
405
- if(@verbose)
406
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
407
- end
319
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
408
320
  word_alternative_spellings.add(temp)
409
321
  temp2 = temp.sub(/w'/, "&")
410
322
  if(temp != temp2 )
411
323
  temp = temp2
412
- if(@verbose)
413
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
414
- end
324
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
415
325
  word_alternative_spellings.add(temp)
416
326
  end
417
327
 
@@ -419,23 +329,17 @@ class Raramorph
419
329
  temp2 = temp.gsub(/Y/,"y")
420
330
  if(temp != temp2 )
421
331
  temp = temp2
422
- if(@verbose)
423
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
424
- end
332
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
425
333
  word_alternative_spellings.add(temp)
426
334
  end
427
335
  temp2 = temp.sub(/w'/, "&")
428
336
  if(temp != temp2 )
429
337
  temp = temp2
430
- if(@verbose)
431
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
432
- end
338
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
433
339
  word_alternative_spellings.add(temp)
434
340
  end
435
341
  temp.sub!(/p$/, "h")
436
- if(@verbose)
437
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
438
- end
342
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
439
343
  word_alternative_spellings.add(temp)
440
344
 
441
345
  else
@@ -443,40 +347,30 @@ class Raramorph
443
347
  if(temp!=temp2)
444
348
  temp = temp2
445
349
  temp.gsub!(/Y/, "y")
446
- if(@verbose)
447
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
448
- end
350
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
449
351
  word_alternative_spellings.add(temp)
450
352
  temp2 = temp.sub(/w'/, "&")
451
353
  if(temp != temp2 )
452
354
  temp = temp2
453
- if(@verbose)
454
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
455
- end
355
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
456
356
  word_alternative_spellings.add(temp)
457
357
  end
458
358
  else
459
359
  temp2 = temp.gsub(/Y/, "y")
460
360
  if(temp != temp2)
461
- if(@verbose)
462
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
463
- end
361
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
464
362
  word_alternative_spellings.add(temp)
465
363
  temp2 = temp.sub(/w'/, "&")
466
364
  if(temp != temp2 )
467
365
  temp = temp2
468
- if(@verbose)
469
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
470
- end
366
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
471
367
  word_alternative_spellings.add(temp)
472
368
  end
473
369
  else
474
370
  temp2 = temp.sub(/w'/, "&")
475
371
  if(temp != temp2 )
476
372
  temp = temp2
477
- if(@verbose)
478
- @stream.puts "Found alternative spelling "+ temp + " for word " + translitered
479
- end
373
+ @logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
480
374
  word_alternative_spellings.add(temp)
481
375
  end
482
376
  end
@@ -494,10 +388,9 @@ class Raramorph
494
388
  # * [output_filename] Output file path
495
389
  # * [verbose] Setter for verbose
496
390
  # * [not_arabic] alias for out_put_bucwalter for indicating the output format in buckwalter indications or will be arabic
497
- def self.execute(input_filename, output_filename ,verbose = false, not_arabic = true)
498
- @output_stream = true
391
+ def self.execute(input_filename, output_filename ,verbose = false, not_arabic = false)
392
+ @logger = Logger.new(true , output_filename )
499
393
  @not_arabic = not_arabic
500
- @verbose = verbose
501
394
  # Lines processed
502
395
  @lines_counter = 0
503
396
  # Arabic tokens processed
@@ -514,24 +407,11 @@ class Raramorph
514
407
  # * [value] = occurences
515
408
  #
516
409
  @not_found = {}
517
- @stream = StringIO.new
518
-
519
410
  analyze(input_filename , @not_arabic)
520
- File.open(output_filename , "w") do |f|
521
- f.puts @stream.string
522
- end
411
+ @logger.log
523
412
  print_stats
524
413
  end
525
414
  end
526
415
 
527
- class SegmentedWord
528
- # Class For Storing the Data of segmented Word
529
- # Author:: eSpace technologies www.eSpace.com.eg
530
- # Copyright:: 2008
531
- attr_reader :prefix , :stem , :suffix
532
- def initialize(prefix , stem , suffix)
533
- @prefix = prefix
534
- @stem = stem
535
- @suffix = suffix
536
- end
537
- end
416
+ class SegmentedWord < Struct.new( :prefix , :stem , :suffix) ; end
417
+
@@ -1,14 +1,47 @@
1
1
  # A class to find the solution of the word
2
- #
3
2
  # Author:: eSpace technologies www.eSpace.com.eg
4
3
  # Copyright:: 2008
5
- #
4
+
6
5
 
7
6
 
8
7
  class Solution
9
8
 
10
9
  attr_reader :prefix, :stem, :suffix, :cnt
10
+ @@ends_with_set_for_pos_one = Set.new(["CONJ","EMPHATIC_PARTICLE","FUNC_WORD",
11
+ "FUT_PART","INTERJ","INTERROG_PART","IV1S","IV2MS",
12
+ "IV2FS","IV3MS","IV3FS","IV2D","IV2FD","IV3MD","IV3FD",
13
+ "IV1P","IV2MP","IV2FP","IV3MP","IV3FP","NEG_PART",
14
+ "PREP","RESULT_CLAUSE_PARTICLE"])
11
15
 
16
+ @@ends_with_set_for_pos_two = Set.new(["CASE_INDEF_NOM","CASE_INDEF_ACC",
17
+ "CASE_INDEF_ACCGEN","CASE_INDEF_GEN" ,"CASE_DEF_NOM" ,
18
+ "CASE_DEF_ACC" ,"CASE_DEF_ACCGEN","CASE_DEF_GEN" ,
19
+ "NSUFF_MASC_SG_ACC_INDEF" ,"NSUFF_FEM_SG" ,"NSUFF_MASC_DU_NOM" ,
20
+ "NSUFF_MASC_DU_NOM_POSS" ,"NSUFF_MASC_DU_ACCGEN" ,
21
+ "NSUFF_MASC_DU_ACCGEN_POSS" ,"NSUFF_FEM_DU_NOM" ,
22
+ "NSUFF_FEM_DU_NOM_POSS" ,"NSUFF_FEM_DU_ACCGEN" ,
23
+ "NSUFF_FEM_DU_ACCGEN_POSS" ,"NSUFF_MASC_PL_NOM" ,
24
+ "NSUFF_MASC_PL_NOM_POSS" ,"NSUFF_MASC_PL_ACCGEN" ,
25
+ "NSUFF_MASC_PL_ACCGEN_POSS" ,"NSUFF_FEM_PL" ,"POSS_PRON_1S",
26
+ "POSS_PRON_2MS" ,"POSS_PRON_2FS" ,"POSS_PRON_3MS" ,
27
+ "POSS_PRON_3FS","POSS_PRON_2D" ,"POSS_PRON_3D" ,"POSS_PRON_1P",
28
+ "POSS_PRON_2MP" ,"POSS_PRON_2FP" ,"POSS_PRON_3MP" ,"POSS_PRON_3FP" ,
29
+ "IVSUFF_DO:1S" ,"IVSUFF_DO:2MS" ,"IVSUFF_DO:2FS" ,"IVSUFF_DO:3MS" ,
30
+ "IVSUFF_DO:3FS" ,"IVSUFF_DO:2D" ,"IVSUFF_DO:3D" ,"IVSUFF_DO:1P" ,
31
+ "IVSUFF_DO:2MP" ,"IVSUFF_DO:2FP" ,"IVSUFF_DO:3MP" ,"IVSUFF_DO:3FP" ,
32
+ "IVSUFF_MOOD:I" ,"IVSUFF_SUBJ:2FS_MOOD:I" ,"IVSUFF_SUBJ:D_MOOD:I" ,
33
+ "IVSUFF_SUBJ:3D_MOOD:I" ,"IVSUFF_SUBJ:MP_MOOD:I" ,"IVSUFF_MOOD:S",
34
+ "IVSUFF_SUBJ:2FS_MOOD:SJ" ,"IVSUFF_SUBJ:D_MOOD:SJ","IVSUFF_SUBJ:MP_MOOD:SJ" ,
35
+ "IVSUFF_SUBJ:3MP_MOOD:SJ" ,"IVSUFF_SUBJ:FP" ,"PVSUFF_DO:1S" ,"PVSUFF_DO:2MS" ,
36
+ "PVSUFF_DO:2FS" ,"PVSUFF_DO:3MS" ,"PVSUFF_DO:3FS" ,"PVSUFF_DO:2D" ,
37
+ "PVSUFF_DO:3D" ,"PVSUFF_DO:1P" ,"PVSUFF_DO:2MP" ,"PVSUFF_DO:2FP" ,
38
+ "PVSUFF_DO:3MP" ,"PVSUFF_DO:3FP" ,"PVSUFF_SUBJ:1S" ,"PVSUFF_SUBJ:2MS" ,
39
+ "PVSUFF_SUBJ:2FS" ,"PVSUFF_SUBJ:3MS" ,"PVSUFF_SUBJ:3FS" ,"PVSUFF_SUBJ:2MD" ,
40
+ "PVSUFF_SUBJ:2FD" ,"PVSUFF_SUBJ:3MD" ,"PVSUFF_SUBJ:3FD" ,"PVSUFF_SUBJ:1P" ,
41
+ "PVSUFF_SUBJ:2MP" ,"PVSUFF_SUBJ:2FP" ,"PVSUFF_SUBJ:3MP" ,"PVSUFF_SUBJ:3FP" ,
42
+ "CVSUFF_DO:1S" ,"CVSUFF_DO:3MS" ,"CVSUFF_DO:3FS" ,"CVSUFF_DO:3D" ,
43
+ "CVSUFF_DO:1P" ,"CVSUFF_DO:3MP" ,"CVSUFF_DO:3FP" ,"CVSUFF_SUBJ:2MS" ,
44
+ "CVSUFF_SUBJ:2FS" ,"CVSUFF_SUBJ:2MP"])
12
45
  protected
13
46
 
14
47
  # Constructs a solution for a word. Note that the prefix, stem and suffix combination is <b>recomputed</b>
@@ -42,60 +75,35 @@ class Solution
42
75
  @stemsGlosses = stem.glosses
43
76
  #The suffixes glosses.
44
77
  @suffixesGlosses = suffix.glosses
45
-
46
- if (@stemsPOS.length != @stemsGlosses.length)
47
- if (@debug)
48
- puts "\"" + get_lemma() + "\" : stem's sizes for POS (" + @stemsPOS.length.to_s + ") and GLOSS ("+ @stemsGlosses.length.to_s + ") do not match"
49
- end
50
- end
51
-
78
+
79
+ puts "\"#{get_lemma()}\" : stem's sizes for POS (\"#{@stemsPOS.length.to_s}\") and GLOSS (\"#{@stemsGlosses.length.to_s}\") do not match" if (@stemsPOS.length != @stemsGlosses.length and @debug)
80
+
52
81
  #Normalize stems since some of them can contain prefixes
53
82
 
54
83
  while(@stemsPOS.length>0)
55
84
  stemPOS = @stemsPOS.slice(0)
56
- if(stemPOS)
57
- stemPOS.force_encoding "UTF-8"
58
- end
85
+
86
+ stemPOS.force_encoding "UTF-8" if(stemPOS)
87
+
59
88
  if (@stemsGlosses.length>0)
60
89
  stemGloss = @stemsGlosses.slice(0)
61
90
  else
62
91
  stemGloss = nil
63
92
  end
64
- if(stemGloss)
65
- stemGloss.force_encoding "UTF-8"
66
- end
67
- if (stemPOS.end_with?("CONJ") or
68
- stemPOS.end_with?("EMPHATIC_PARTICLE") or
69
- stemPOS.end_with?("FUNC_WORD") or
70
- stemPOS.end_with?("FUT_PART") or
71
- stemPOS.end_with?("INTERJ") or
72
- stemPOS.end_with?("INTERROG_PART") or
73
- stemPOS.end_with?("IV1S") or
74
- stemPOS.end_with?("IV2MS") or
75
- stemPOS.end_with?("IV2FS") or
76
- stemPOS.end_with?("IV3MS") or
77
- stemPOS.end_with?("IV3FS") or
78
- stemPOS.end_with?("IV2D") or
79
- stemPOS.end_with?("IV2FD") or
80
- stemPOS.end_with?("IV3MD") or
81
- stemPOS.end_with?("IV3FD") or
82
- stemPOS.end_with?("IV1P") or
83
- stemPOS.end_with?("IV2MP") or
84
- stemPOS.end_with?("IV2FP") or
85
- stemPOS.end_with?("IV3MP") or
86
- stemPOS.end_with?("IV3FP") or
87
- stemPOS.end_with?("NEG_PART") or
88
- stemPOS.end_with?("PREP") or
89
- stemPOS.end_with?("RESULT_CLAUSE_PARTICLE") )
93
+
94
+ stemGloss.force_encoding "UTF-8" if(stemGloss)
95
+
96
+
97
+ if(stemPOS.ends_with_suffix_set?(@@ends_with_set_for_pos_one) )
90
98
  @stemsPOS.slice!(0)
91
99
  @prefixesPOS.push(stemPOS)
92
100
  if (stemGloss)
93
101
  @stemsGlosses.slice!(0)
94
102
  @prefixesGlosses.push(stemGloss)
95
103
  end
96
- else
97
- break
98
- end
104
+ else
105
+ break
106
+ end
99
107
  end
100
108
 
101
109
  #Normalize stems since some of them can contain suffixes
@@ -112,101 +120,8 @@ class Solution
112
120
  if(stemGloss)
113
121
  stemGloss.force_encoding "UTF-8"
114
122
  end
115
-
116
- if (stemPOS.end_with?("CASE_INDEF_NOM") or
117
- stemPOS.end_with?("CASE_INDEF_ACC") or
118
- stemPOS.end_with?("CASE_INDEF_ACCGEN") or
119
- stemPOS.end_with?("CASE_INDEF_GEN") or
120
- stemPOS.end_with?("CASE_DEF_NOM") or
121
- stemPOS.end_with?("CASE_DEF_ACC") or
122
- stemPOS.end_with?("CASE_DEF_ACCGEN") or
123
- stemPOS.end_with?("CASE_DEF_GEN") or
124
- stemPOS.end_with?("NSUFF_MASC_SG_ACC_INDEF") or
125
- stemPOS.end_with?("NSUFF_FEM_SG") or
126
- stemPOS.end_with?("NSUFF_MASC_DU_NOM") or
127
- stemPOS.end_with?("NSUFF_MASC_DU_NOM_POSS") or
128
- stemPOS.end_with?("NSUFF_MASC_DU_ACCGEN") or
129
- stemPOS.end_with?("NSUFF_MASC_DU_ACCGEN_POSS") or
130
- stemPOS.end_with?("NSUFF_FEM_DU_NOM") or
131
- stemPOS.end_with?("NSUFF_FEM_DU_NOM_POSS") or
132
- stemPOS.end_with?("NSUFF_FEM_DU_ACCGEN") or
133
- stemPOS.end_with?("NSUFF_FEM_DU_ACCGEN_POSS") or
134
- stemPOS.end_with?("NSUFF_MASC_PL_NOM") or
135
- stemPOS.end_with?("NSUFF_MASC_PL_NOM_POSS") or
136
- stemPOS.end_with?("NSUFF_MASC_PL_ACCGEN") or
137
- stemPOS.end_with?("NSUFF_MASC_PL_ACCGEN_POSS") or
138
- stemPOS.end_with?("NSUFF_FEM_PL") or
139
- stemPOS.end_with?("POSS_PRON_1S") or
140
- stemPOS.end_with?("POSS_PRON_2MS") or
141
- stemPOS.end_with?("POSS_PRON_2FS") or
142
- stemPOS.end_with?("POSS_PRON_3MS") or
143
- stemPOS.end_with?("POSS_PRON_3FS") or
144
- stemPOS.end_with?("POSS_PRON_2D") or
145
- stemPOS.end_with?("POSS_PRON_3D") or
146
- stemPOS.end_with?("POSS_PRON_1P") or
147
- stemPOS.end_with?("POSS_PRON_2MP") or
148
- stemPOS.end_with?("POSS_PRON_2FP") or
149
- stemPOS.end_with?("POSS_PRON_3MP") or
150
- stemPOS.end_with?("POSS_PRON_3FP") or
151
- stemPOS.end_with?("IVSUFF_DO:1S") or
152
- stemPOS.end_with?("IVSUFF_DO:2MS") or
153
- stemPOS.end_with?("IVSUFF_DO:2FS") or
154
- stemPOS.end_with?("IVSUFF_DO:3MS") or
155
- stemPOS.end_with?("IVSUFF_DO:3FS") or
156
- stemPOS.end_with?("IVSUFF_DO:2D") or
157
- stemPOS.end_with?("IVSUFF_DO:3D") or
158
- stemPOS.end_with?("IVSUFF_DO:1P") or
159
- stemPOS.end_with?("IVSUFF_DO:2MP") or
160
- stemPOS.end_with?("IVSUFF_DO:2FP") or
161
- stemPOS.end_with?("IVSUFF_DO:3MP") or
162
- stemPOS.end_with?("IVSUFF_DO:3FP") or
163
- stemPOS.end_with?("IVSUFF_MOOD:I") or
164
- stemPOS.end_with?("IVSUFF_SUBJ:2FS_MOOD:I") or
165
- stemPOS.end_with?("IVSUFF_SUBJ:D_MOOD:I") or
166
- stemPOS.end_with?("IVSUFF_SUBJ:3D_MOOD:I") or
167
- stemPOS.end_with?("IVSUFF_SUBJ:MP_MOOD:I") or
168
- stemPOS.end_with?("IVSUFF_MOOD:S") or
169
- stemPOS.end_with?("IVSUFF_SUBJ:2FS_MOOD:SJ") or
170
- stemPOS.end_with?("IVSUFF_SUBJ:D_MOOD:SJ") or
171
- stemPOS.end_with?("IVSUFF_SUBJ:MP_MOOD:SJ") or
172
- stemPOS.end_with?("IVSUFF_SUBJ:3MP_MOOD:SJ") or
173
- stemPOS.end_with?("IVSUFF_SUBJ:FP") or
174
- stemPOS.end_with?("PVSUFF_DO:1S") or
175
- stemPOS.end_with?("PVSUFF_DO:2MS") or
176
- stemPOS.end_with?("PVSUFF_DO:2FS") or
177
- stemPOS.end_with?("PVSUFF_DO:3MS") or
178
- stemPOS.end_with?("PVSUFF_DO:3FS") or
179
- stemPOS.end_with?("PVSUFF_DO:2D") or
180
- stemPOS.end_with?("PVSUFF_DO:3D") or
181
- stemPOS.end_with?("PVSUFF_DO:1P") or
182
- stemPOS.end_with?("PVSUFF_DO:2MP") or
183
- stemPOS.end_with?("PVSUFF_DO:2FP") or
184
- stemPOS.end_with?("PVSUFF_DO:3MP") or
185
- stemPOS.end_with?("PVSUFF_DO:3FP") or
186
- stemPOS.end_with?("PVSUFF_SUBJ:1S") or
187
- stemPOS.end_with?("PVSUFF_SUBJ:2MS") or
188
- stemPOS.end_with?("PVSUFF_SUBJ:2FS") or
189
- stemPOS.end_with?("PVSUFF_SUBJ:3MS") or
190
- stemPOS.end_with?("PVSUFF_SUBJ:3FS") or
191
- stemPOS.end_with?("PVSUFF_SUBJ:2MD") or
192
- stemPOS.end_with?("PVSUFF_SUBJ:2FD") or
193
- stemPOS.end_with?("PVSUFF_SUBJ:3MD") or
194
- stemPOS.end_with?("PVSUFF_SUBJ:3FD") or
195
- stemPOS.end_with?("PVSUFF_SUBJ:1P") or
196
- stemPOS.end_with?("PVSUFF_SUBJ:2MP") or
197
- stemPOS.end_with?("PVSUFF_SUBJ:2FP") or
198
- stemPOS.end_with?("PVSUFF_SUBJ:3MP") or
199
- stemPOS.end_with?("PVSUFF_SUBJ:3FP") or
200
- stemPOS.end_with?("CVSUFF_DO:1S") or
201
- stemPOS.end_with?("CVSUFF_DO:3MS") or
202
- stemPOS.end_with?("CVSUFF_DO:3FS") or
203
- stemPOS.end_with?("CVSUFF_DO:3D") or
204
- stemPOS.end_with?("CVSUFF_DO:1P") or
205
- stemPOS.end_with?("CVSUFF_DO:3MP") or
206
- stemPOS.end_with?("CVSUFF_DO:3FP") or
207
- stemPOS.end_with?("CVSUFF_SUBJ:2MS") or
208
- stemPOS.end_with?("CVSUFF_SUBJ:2FS") or
209
- stemPOS.end_with?("CVSUFF_SUBJ:2MP") )
123
+
124
+ if(stemPOS.ends_with_suffix_set?(@@ends_with_set_for_pos_two))
210
125
  @stemsPOS.slice!(@stemsPOS.length-1)
211
126
  @suffixesPOS.insert(0,stemPOS)
212
127
  if (stemGloss)
@@ -332,14 +247,10 @@ class Solution
332
247
  sb = ""
333
248
  sb.force_encoding "UTF-8"
334
249
  vocal = get_prefixes_arabic_vocalizations()
335
- if(vocal!=nil)
336
- sb += vocal[0].to_s
337
- end
250
+ sb += vocal[0].to_s if vocal!=nil
338
251
 
339
- s = get_stem_arabic_vocalization()
340
- if ( s != nil)
341
- sb+=s
342
- end
252
+ s = get_stem_arabic_vocalization()
253
+ sb+=s if s!= nil
343
254
  vocal = get_suffixes_arabic_vocalizations()
344
255
  if(vocal!=nil)
345
256
  sb += vocal[0].to_s
@@ -376,13 +287,13 @@ class Solution
376
287
  sb = ""
377
288
  sb.force_encoding "UTF-8"
378
289
  if (!@prefix.morphology.empty? and @prefix.morphology != nil )
379
- sb+= "\t" + "prefix : " + @prefix.morphology + "\n"
290
+ sb+= "\tprefix : #{@prefix.morphology}\n"
380
291
  end
381
292
  if (!@stem.morphology.empty? and @stem.morphology != nil)
382
- sb+= "\t" + "stem : " + @stem.morphology + "\n"
293
+ sb+= "\tstem : #{@stem.morphology}\n"
383
294
  end
384
295
  if (!@suffix.morphology.empty? and @suffix.morphology != nil)
385
- sb+= "\t" + "suffix : " + @suffix.morphology + "\n"
296
+ sb+= "\tsuffix : #{@suffix.morphology}\n"
386
297
  end
387
298
  return sb
388
299
  end
@@ -517,14 +428,14 @@ class Solution
517
428
  sb.force_encoding "UTF-8"
518
429
  glosses = get_prefixes_glosses()
519
430
  if (glosses and glosses[0] != nil)
520
- sb+=("\t" + "prefix : " + glosses[0].gsub(";","/") + "\n")
431
+ sb+=("\tprefix : #{glosses[0].gsub(";","/")}\n")
521
432
  end
522
433
  if (get_stem_gloss() != nil)
523
- sb+=("\t" + "stem : " +get_stem_gloss().gsub(";","/") + "\n")
434
+ sb+=("\tstem : #{get_stem_gloss().gsub(";","/")}\n")
524
435
  end
525
436
  glosses = get_suffixes_glosses()
526
437
  if (glosses and glosses[0] != nil)
527
- sb+=("\t" + "suffix : " + glosses[0].gsub(";","/") + "\n")
438
+ sb+=("\tsuffix : #{glosses[0].gsub(";","/")}\n")
528
439
  end
529
440
  return sb
530
441
  end
@@ -603,32 +514,26 @@ class Solution
603
514
  end
604
515
  temp_POS = []
605
516
  arr.each do |pos|
606
- array = pos.split("/");
517
+ array = pos.split("/")
607
518
  j=1
608
519
  if(type==1)
609
520
  sb = ""
610
521
  elsif(type==2)
611
- sb = array[0] + "\t"
522
+ sb = "#{array[0]}\t"
612
523
  else
613
- sb = LatinArabicTranslator.translate(array[0]) + "\t"
524
+ sb = "#{LatinArabicTranslator.translate(array[0])}\t"
614
525
  sb.force_encoding "UTF-8"
615
526
  end
616
- while( j < array.length)
617
- if (j > 1)
618
- sb+=" / "
619
- end
620
- sb+=array[j]
621
- j+=1
622
- end
527
+ sb << array[1..array.length].join(" / ")
623
528
  temp_POS.push(sb)
624
529
  end
625
530
 
626
531
  if(pre_stem_suff==2)
627
532
  if ((temp_POS.length > 1) and @debug)
628
- puts "More than one stem for " + temp_POS.to_s
533
+ puts "More than one stem for #{temp_POS.to_s}"
629
534
  end
630
535
  if (type ==1 and temp_POS[0].empty?)
631
- puts "Empty POS for stem " + get_stem_long_POS()
536
+ puts "Empty POS for stem #{get_stem_long_POS()}"
632
537
  end
633
538
  #return the first anyway :-(
634
539
  return temp_POS[0]
@@ -649,7 +554,7 @@ class Solution
649
554
  end
650
555
  if (temp_POS != nil)
651
556
  if (temp_POS[0]!=nil)
652
- sb+=("\t" + "prefix : " + temp_POS[0] + "\n")
557
+ sb << ("\tprefix : #{temp_POS[0]}\n")
653
558
  end
654
559
  end
655
560
  if(arabic)
@@ -658,7 +563,7 @@ class Solution
658
563
  s = get_stem_long_POS()
659
564
  end
660
565
  if ( s != nil)
661
- sb+=("\t" + "stem : " + s + "\n")
566
+ sb << ("\tstem : #{s} \n")
662
567
  end
663
568
  if(arabic)
664
569
  temp_POS =get_suffixes_arabic_long_POS()
@@ -667,9 +572,21 @@ class Solution
667
572
  end
668
573
  if (temp_POS != nil)
669
574
  if (temp_POS[0]!=nil)
670
- sb+=("\t" + "suffix : " + temp_POS[0] + "\n")
575
+ sb << ("\tsuffix : #{temp_POS[0]}\n")
671
576
  end
672
577
  end
673
578
  return sb
674
579
  end
580
+
675
581
  end
582
+
583
+ class String
584
+ def ends_with_suffix_set?(ends_with_suffix_set)
585
+ length = self.length
586
+ length.times { |i|
587
+ return true if ends_with_suffix_set.member?(self[i..length])
588
+
589
+ }
590
+ return false
591
+ end
592
+ end
@@ -24,7 +24,7 @@ class Translator
24
24
  def translate(string)
25
25
  result = ""
26
26
  i = 0
27
- ## IF non Utf8 Char REturn
27
+ ## IF non Utf8 Char return
28
28
  return string unless string.length % 2 ==0
29
29
  while i < string.length-1
30
30
  char = string[i..i+1]
data/lib/raramorph.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  #Dir[File.join(File.dirname(__FILE__), 'raramorph/**/*.rb')].sort.each { |lib| require lib }
2
+
2
3
  $:.unshift File.expand_path(File.dirname(__FILE__) )
3
4
  start = Time.now
4
5
  require 'set'
5
- require 'stringio'
6
+ require 'stringio'
7
+ require 'raramorph/logger'
6
8
  require 'raramorph/translator'
7
9
  require 'raramorph/arabic_latin_translator'
8
10
  require 'raramorph/latin_arabic_translator'
@@ -3,26 +3,31 @@
3
3
  # ARGV[2] # Verbose Default False
4
4
  # ARGV[4] # BuckWalter Default False ( Arabic Output)
5
5
  $:.unshift File.expand_path(File.dirname(__FILE__) )
6
- if ARGV.length > 2 and ARGV.length <= 4
6
+ if ARGV.length >= 2 and ARGV.length <= 4
7
7
  require 'raramorph'
8
8
  start = Time.now
9
- Raramorph.execute(ARGV[0] , ARGV[1] , ARGV[2] , ARGV[3] )
9
+ verbose = false
10
+ not_arabic = true
11
+ verbose = true if ARGV[2] and ARGV[2] == "-v"
12
+ not_arabic = false if ARGV[3] and ARGV[3] == "-a"
13
+ not_arabic = false if ARGV[2] and ARGV[2] == "-a"
14
+ Raramorph.execute(ARGV[0] , ARGV[1] , verbose , not_arabic )
10
15
  puts "Time Elapsed= " + ( Time.now - start).to_s
11
16
  else
12
17
  puts("Arabic Morphological Analyzer for Ruby")
13
18
  puts("Ported to Ruby by Moustafa Emara and Hany Salah El din , eSpace-technologies.(www.espace.com.eg) , 2008.")
14
19
  puts("Based on :")
15
20
  puts("BUCKWALTER ARABIC MORPHOLOGICAL ANALYZER")
16
- puts("This program is developed under the Ruby-Licences")
21
+ puts("This program is developed under the MIT-Licences")
17
22
  puts("Usage :")
18
23
  puts("")
19
- puts("RaraMorph inFile [inEncoding] [outFile] [outEncoding] [-v]")
24
+ puts("raraMorph inFile [inEncoding] [outFile] [-v] [-a]")
20
25
  puts("")
21
26
  puts("inFile : file to be analyzed")
22
27
  puts("inEncoding : encoding for inFile, default UTF-8")
23
- puts("outFile : result file, default console")
24
- puts("outEncoding : encoding for outFile, if not specified use Buckwalter transliteration with system's file.encoding")
28
+ puts("outFile : result file ")
25
29
  puts("-v : verbose mode")
30
+ puts("-a : Aarbic Output" )
26
31
  end
27
32
 
28
33
 
data/raramorph.gemspec CHANGED
@@ -1,8 +1,8 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "raramorph"
3
- s.version = "0.1.0"
3
+ s.version = "0.1.1"
4
4
  s.date = "2008-09-06"
5
- s.summary = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )"
5
+ s.summary = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg ) "
6
6
  s.email = "moustafa.emara@espace.com.eg"
7
7
  s.homepage = "http://github.com/espace/raramorph"
8
8
  s.description = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )"
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
20
20
  "lib/raramorph/arabic_latin_translator.rb",
21
21
  "lib/raramorph/latin_arabic_translator.rb",
22
22
  "lib/raramorph/in_memory_dictionary_handler.rb",
23
+ "lib/raramorph/logger.rb",
23
24
  "lib/dictionaries/dictPrefixes",
24
25
  "lib/dictionaries/dictStems",
25
26
  "lib/dictionaries/dictSuffixes",
@@ -34,7 +35,8 @@ Gem::Specification.new do |s|
34
35
  s.executables = %w(raramorph)
35
36
  s.required_ruby_version = '>= 1.9'
36
37
  s.bindir = "bin"
37
- #s.rdoc_options = ["--main", "README"]
38
- #s.extra_rdoc_files = ["README"]
38
+ s.rdoc_options = ["--main", "README"]
39
+ s.extra_rdoc_files = ["README"]
39
40
  #s.extensions << "ext/extconf.rb"
41
+
40
42
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: raramorph
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
6
+ version: 0.1.1
7
7
  date: 2008-09-06 00:00:00 +02:00
8
8
  summary: Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )
9
9
  require_paths:
@@ -39,6 +39,7 @@ files:
39
39
  - lib/raramorph/translator.rb
40
40
  - lib/raramorph/arabic_latin_translator.rb
41
41
  - lib/raramorph/latin_arabic_translator.rb
42
+ - lib/raramorph/logger.rb
42
43
  - lib/dictionaries/dictPrefixes
43
44
  - lib/dictionaries/dictStems
44
45
  - lib/dictionaries/dictSuffixes
@@ -49,12 +50,14 @@ files:
49
50
  - lib/raramorph.rb
50
51
  - lib/raramorph_main.rb
51
52
  - lib/test_input/UTF-8.txt
53
+ - README
52
54
  test_files: []
53
55
 
54
- rdoc_options: []
55
-
56
- extra_rdoc_files: []
57
-
56
+ rdoc_options:
57
+ - --main
58
+ - README
59
+ extra_rdoc_files:
60
+ - README
58
61
  executables:
59
62
  - raramorph
60
63
  extensions: []