raramorph 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +56 -0
- data/lib/raramorph/arabic_latin_translator.rb +1 -5
- data/lib/raramorph/dictionary_entry.rb +1 -1
- data/lib/raramorph/in_memory_dictionary_handler.rb +58 -20
- data/lib/raramorph/in_memory_solutions_handler.rb +1 -1
- data/lib/raramorph/latin_arabic_translator.rb +1 -1
- data/lib/raramorph/logger.rb +20 -0
- data/lib/raramorph/raramorph.rb +53 -173
- data/lib/raramorph/solution.rb +81 -164
- data/lib/raramorph/translator.rb +1 -1
- data/lib/raramorph.rb +3 -1
- data/lib/raramorph_main.rb +11 -6
- data/raramorph.gemspec +6 -4
- metadata +8 -5
data/README
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
== Raramorph
|
2
|
+
By eSpace-technologies
|
3
|
+
http://www.espace.com.eg
|
4
|
+
http://www.espace.com.eg/docs/raramorph/index.html
|
5
|
+
http://github.com/espace/raramorph
|
6
|
+
http://raramorph.rubyforge.org
|
7
|
+
|
8
|
+
== DESCRIPTION:
|
9
|
+
|
10
|
+
Raramorph is a Ruby 1.9 gem providing an intelligent port of Aramorph, based on the Buckwalter Arabic Morphological Analyzer Version 1.0.
|
11
|
+
|
12
|
+
== Usage
|
13
|
+
|
14
|
+
require 'raramorph'
|
15
|
+
# For analyzing a file
|
16
|
+
Raramorph.execute(input_filename, output_filename ,verbose = false, not_arabic = false)
|
17
|
+
# You can also use functions like analyze_token, tokenize and segment_word, all available as static methods of the Raramorph class
|
18
|
+
OR
|
19
|
+
From the command line
|
20
|
+
raramorph input_file_name output_file_name -v -a
|
21
|
+
-v verbose mode ( optional )
|
22
|
+
-a arabic output ( optional )
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
== INSTALL:
|
27
|
+
|
28
|
+
sudo gem install raramorph
|
29
|
+
|
30
|
+
== SOURCE CODE:
|
31
|
+
http://github.com/espace/raramorph/tree/master
|
32
|
+
|
33
|
+
== LICENSE:
|
34
|
+
|
35
|
+
(The MIT License)
|
36
|
+
|
37
|
+
Copyright (c) 2008 Moustafa Emara , Hany Salah el deen
|
38
|
+
|
39
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
40
|
+
a copy of this software and associated documentation files (the
|
41
|
+
'Software'), to deal in the Software without restriction, including
|
42
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
43
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
44
|
+
permit persons to whom the Software is furnished to do so, subject to
|
45
|
+
the following conditions:
|
46
|
+
|
47
|
+
The above copyright notice and this permission notice shall be
|
48
|
+
included in all copies or substantial portions of the Software.
|
49
|
+
|
50
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
51
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
52
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
53
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
54
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
55
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
56
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -2,9 +2,7 @@
|
|
2
2
|
# Author:: eSpace technologies www.eSpace.com.eg
|
3
3
|
# Copyright:: 2008
|
4
4
|
#
|
5
|
-
|
6
5
|
class ArabicLatinTranslator
|
7
|
-
|
8
6
|
# * Table Used for Translation From Arabic To English, i.e. ( Romanize Word )
|
9
7
|
# * According to Buckwalter system Dictionary
|
10
8
|
TABLE = { "\u0621"=> "'" , "\u0622"=> "|" , "\u0623"=> ">" , "\u0624"=> "&" , "\u0625"=> "<" , "\u0626"=> "}" ,
|
@@ -20,9 +18,7 @@ class ArabicLatinTranslator
|
|
20
18
|
#Not suitable for morphological analysis : remove all vowels/diacritics, i.e. undo the job !
|
21
19
|
VOWEL_REMOVER = Regexp.compile("[FNKaui~o]")
|
22
20
|
STRIPER = Regexp.compile("[`\\{]")
|
23
|
-
|
24
|
-
|
25
|
-
end
|
21
|
+
|
26
22
|
|
27
23
|
# * Translate : Transliterate the Arabic word into a Roman-lettered word
|
28
24
|
# * [word] Word String To be processed
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Class For Storing And Loading Dictionaries
|
2
2
|
# Author:: eSpace technologies www.eSpace.com.eg
|
3
3
|
# Copyright:: 2008
|
4
|
-
|
4
|
+
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
7
|
class InMemoryDictionaryHandler
|
@@ -25,25 +25,24 @@ class InMemoryDictionaryHandler
|
|
25
25
|
### Variables #####
|
26
26
|
@@handler = nil
|
27
27
|
@@regex = Regexp.compile(".*" + "<pos>(.+?)</pos>" + ".*")
|
28
|
-
@@morphology_regexs=[
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
@@morphology_regexs[7] = Regexp.compile(".*" + "iy~$")
|
28
|
+
@@morphology_regexs=[Regexp.compile("^(Pref-0|Suff-0)$") ,
|
29
|
+
Regexp.compile("^F" + ".*") ,
|
30
|
+
Regexp.compile("^IV" + ".*") ,
|
31
|
+
Regexp.compile("^PV" + ".*") ,
|
32
|
+
Regexp.compile("^CV" + ".*") ,
|
33
|
+
Regexp.compile("^N" + ".*") ,
|
34
|
+
Regexp.compile("^[A-Z]" + ".*") ,
|
35
|
+
Regexp.compile(".*" + "iy~$")
|
36
|
+
]
|
38
37
|
@@compatability_stpliter = Regexp.compile("\\s+")
|
39
|
-
@@vocalization_array =[
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
38
|
+
@@vocalization_array =["/FUNC_WORD" ,
|
39
|
+
"/VERB_IMPERFECT" ,
|
40
|
+
"/VERB_PERFECT" ,
|
41
|
+
"/VERB_IMPERATIVE" ,
|
42
|
+
"/NOUN_PROP" ,
|
43
|
+
"/NOUN" ,
|
44
|
+
"/NOUN"
|
45
|
+
]
|
47
46
|
|
48
47
|
@@prefixes_stems_compatibility = Set.new
|
49
48
|
#Changed
|
@@ -163,6 +162,44 @@ class InMemoryDictionaryHandler
|
|
163
162
|
@@suffixes = suffixes
|
164
163
|
end
|
165
164
|
|
165
|
+
def analyze_word_in_dictionaries(segmented_word , word_solutions , verbose , count)
|
166
|
+
#Is prefix known ?
|
167
|
+
if has_prefix?(segmented_word.prefix)
|
168
|
+
#Is stem known ?
|
169
|
+
# puts "has prefix"
|
170
|
+
if has_stem?(segmented_word.stem)
|
171
|
+
# puts "has stem"
|
172
|
+
#Is suffix known ?
|
173
|
+
if has_suffix?(segmented_word.suffix)
|
174
|
+
# puts "has suffix"
|
175
|
+
#Compatibility check
|
176
|
+
@@prefixes[segmented_word.prefix].each{|prefix|
|
177
|
+
@@stems[segmented_word.stem].each {|stem|
|
178
|
+
#Prefix/Stem compatibility
|
179
|
+
if prefixes_stems_compatible?(prefix.morphology ,stem.morphology )
|
180
|
+
# puts "has A B Com"
|
181
|
+
@@suffixes[segmented_word.suffix].each {|suffix|
|
182
|
+
# Prefix/Suffix compatiblity
|
183
|
+
if prefixes_suffixes_compatible?(prefix.morphology , suffix.morphology)
|
184
|
+
# puts "has A C Com"
|
185
|
+
# Stems/Suffixes compatiblity
|
186
|
+
if stems_suffixes_compatible?(stem.morphology , suffix.morphology)
|
187
|
+
# puts "has B C COM"
|
188
|
+
#All tests passed : it is a solution
|
189
|
+
count = count + 1
|
190
|
+
word_solutions << Solution.new(verbose , count , prefix , stem , suffix )
|
191
|
+
end
|
192
|
+
end
|
193
|
+
}
|
194
|
+
end
|
195
|
+
}
|
196
|
+
}
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
200
|
+
return count
|
201
|
+
end
|
202
|
+
|
166
203
|
private
|
167
204
|
|
168
205
|
# * load Dictionary from files
|
@@ -241,7 +278,8 @@ class InMemoryDictionaryHandler
|
|
241
278
|
vocalization = splited_line[1]
|
242
279
|
morphology = splited_line[2]
|
243
280
|
gloss_pos = splited_line[3]
|
244
|
-
gloss
|
281
|
+
gloss = ""
|
282
|
+
pos = ""
|
245
283
|
# two ways to get the POS info
|
246
284
|
# (1) explicitly, by extracting it from the gloss field:
|
247
285
|
|
@@ -0,0 +1,20 @@
|
|
1
|
+
class Logger
|
2
|
+
|
3
|
+
|
4
|
+
attr_reader :verbose , :output
|
5
|
+
def initialize(verbose = nil , output = nil )
|
6
|
+
@verbose = verbose
|
7
|
+
@output = output
|
8
|
+
@stream = StringIO.new
|
9
|
+
end
|
10
|
+
|
11
|
+
def info string , require_verbose = false
|
12
|
+
@stream.puts(string) #if ( require_verbose && @verbose || ! require_verbose )
|
13
|
+
end
|
14
|
+
|
15
|
+
def log
|
16
|
+
return puts @stream.string if @output.nil?
|
17
|
+
File.open(@output , "w") { |f|
|
18
|
+
f.puts @stream.string }
|
19
|
+
end
|
20
|
+
end
|
data/lib/raramorph/raramorph.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
# A Ruby port of Buckwalter
|
2
|
-
#
|
1
|
+
# A Ruby port of Buckwalter Morphological Analyzer Version 1.0.
|
3
2
|
# Author:: eSpace technologies www.eSpace.com.eg
|
4
3
|
# Copyright:: 2008
|
5
4
|
|
6
5
|
|
7
|
-
require 'set'
|
8
6
|
|
9
7
|
class Raramorph
|
10
8
|
|
@@ -13,48 +11,14 @@ class Raramorph
|
|
13
11
|
# The solutions handler.
|
14
12
|
@@sol = InMemorySolutionsHandler.create
|
15
13
|
# Whether or not the analyzer should output some convenience messages
|
16
|
-
@verbose
|
17
|
-
# The stream where to output the results
|
18
|
-
|
19
|
-
@output_stream
|
20
|
-
|
21
|
-
#use arabic translation or not?
|
22
|
-
@not_arabic
|
23
|
-
|
24
|
-
#Stats
|
25
|
-
# Lines processed
|
26
|
-
@lines_counter = 0
|
27
|
-
# Arabic tokens processed
|
28
|
-
@not_arabic_tokens_counter = 0
|
29
|
-
# Not arabic tokens processed
|
30
|
-
@not_arabic_tokens_counter = 0
|
31
|
-
|
32
|
-
# Arabic words which have been succesfully analyzed.
|
33
|
-
# * [key] = word
|
34
|
-
# * [value] = occurences
|
35
|
-
#
|
36
|
-
@found = {}
|
37
|
-
|
38
|
-
# Arabic words which have not been succesfully analyzed.
|
39
|
-
# * [key] = word
|
40
|
-
# * [value] = occurences
|
41
|
-
#
|
42
|
-
@not_found = {}
|
43
|
-
|
44
14
|
# Alternative spellings list of regular expressions
|
45
|
-
@@alternative_spellings = [
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
@@alternative_spellings[4] = Regexp.compile(".*" + "p$")
|
15
|
+
@@alternative_spellings = [Regexp.compile(".*" + "Y'$") ,
|
16
|
+
Regexp.compile(".*" + "y'$") ,
|
17
|
+
Regexp.compile(".*" + "y$") ,
|
18
|
+
Regexp.compile(".*" + "h$") ,
|
19
|
+
Regexp.compile(".*" + "p$") ]
|
51
20
|
@@space_regex = Regexp.compile("\\s+")
|
52
21
|
|
53
|
-
|
54
|
-
def self.set_verbose(verbose) #Bolean Variable
|
55
|
-
@verbose = verbose
|
56
|
-
end
|
57
|
-
|
58
22
|
# * Analyze and Process the file ( i.e Doing the morphological Analysis )
|
59
23
|
# * [file_reader_in] Input File Path
|
60
24
|
# * [output_buckwalter] whether the output is written in Buckwalter notation ( i.e. Roman letters ) or in Arabic letters
|
@@ -63,8 +27,8 @@ class Raramorph
|
|
63
27
|
lines= IO.readlines(file_reader_in)
|
64
28
|
lines.each do |line|
|
65
29
|
@lines_counter+=1
|
66
|
-
if(@verbose)
|
67
|
-
puts "Processing line :
|
30
|
+
if(@logger.verbose)
|
31
|
+
puts "Processing line : #{@lines_counter.to_s}"
|
68
32
|
end
|
69
33
|
tokens = tokenize(line)
|
70
34
|
tokens.each do |token|
|
@@ -72,7 +36,7 @@ class Raramorph
|
|
72
36
|
end
|
73
37
|
end
|
74
38
|
#rescue
|
75
|
-
# @
|
39
|
+
# @logger.info "Can not read line " + @lines_counter.to_s
|
76
40
|
#end
|
77
41
|
end
|
78
42
|
|
@@ -102,7 +66,7 @@ class Raramorph
|
|
102
66
|
def self.analyze_token(token , output_buckwalter) #STring , Boolean , REturn Boolean
|
103
67
|
#TO DO SET UP THE PRINT STREAM
|
104
68
|
token.force_encoding "UTF-8"
|
105
|
-
@
|
69
|
+
@logger.info "Processing token : " + "\t" + token
|
106
70
|
#TODO : check accuracy
|
107
71
|
#ignored \u0688 : ARABIC LETTER DDAL
|
108
72
|
#ignored \u06A9 : ARABIC LETTER KEHEH
|
@@ -119,7 +83,7 @@ class Raramorph
|
|
119
83
|
sub_tokens.each{|sub_token|
|
120
84
|
unless sub_token.strip == ""
|
121
85
|
@not_arabic_tokens_counter+=1
|
122
|
-
@
|
86
|
+
@logger.info("Non-Arabic : #{sub_token}")
|
123
87
|
end
|
124
88
|
}
|
125
89
|
return false
|
@@ -128,31 +92,30 @@ class Raramorph
|
|
128
92
|
@not_arabic_tokens_counter+=1
|
129
93
|
|
130
94
|
translitered = ArabicLatinTranslator.translate(token)
|
131
|
-
@
|
95
|
+
@logger.info("Transliteration : \t#{translitered}")
|
132
96
|
|
133
97
|
if @found.has_key?(translitered) #Already processed : previously found
|
134
|
-
@
|
98
|
+
@logger.info("Token already processed." , true )
|
135
99
|
#increase reference counter
|
136
100
|
@found[translitered]+=1
|
137
101
|
has_solutions = true
|
138
102
|
elsif @not_found.has_key?(translitered) #Already processed : previously not found
|
139
|
-
@
|
103
|
+
@logger.info("Token already processed without solution." , true )
|
140
104
|
@not_found[translitered]+=1 #increase reference counter
|
141
105
|
has_solutions = false
|
142
106
|
else
|
143
|
-
@
|
107
|
+
@logger.info("Token not yet processed.", true )
|
144
108
|
|
145
109
|
if (feed_word_solutions(translitered)) #CHANGED #word has solutions...
|
146
110
|
#mark word as found
|
147
111
|
raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
|
148
|
-
@
|
112
|
+
@logger.info("Token has direct solutions." , true )
|
149
113
|
#set reference counter to 1
|
150
114
|
@found[translitered] = 1
|
151
115
|
has_solutions = true
|
152
116
|
else #word has no direct solution
|
153
117
|
if(feed_alternative_spellings(translitered))
|
154
118
|
alternatives_give_solutions = false
|
155
|
-
|
156
119
|
alternatives = @@sol.get_alternative_spellings(translitered)
|
157
120
|
alternatives.each{|alternative|
|
158
121
|
alternatives_give_solutions = (alternatives_give_solutions || feed_word_solutions(alternative))
|
@@ -160,21 +123,21 @@ class Raramorph
|
|
160
123
|
if(alternatives_give_solutions)
|
161
124
|
#consistency check
|
162
125
|
raise "There is already a key for " + translitered + " in found" if @found.has_key?(translitered)
|
163
|
-
@
|
126
|
+
@logger.info("Token's alternative spellings have solutions." , true )
|
164
127
|
#mark word as found set reference counter to 1
|
165
128
|
@found[translitered] = 1
|
166
129
|
has_solutions = true
|
167
130
|
else
|
168
131
|
#consistency check
|
169
132
|
raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
|
170
|
-
@
|
133
|
+
@logger.info("Token's alternative spellings have no solution." , true )
|
171
134
|
@not_found[translitered]=1
|
172
135
|
has_solutions = false
|
173
136
|
end
|
174
137
|
else
|
175
138
|
#there are no alternative
|
176
139
|
raise "There is already a key for " + translitered + " in notFound" if @not_found.has_key?(translitered)
|
177
|
-
@
|
140
|
+
@logger.info("Token has no solution and no alternative spellings." , true )
|
178
141
|
#mark word as not found and set reference counter to 1
|
179
142
|
@not_found[translitered]=1
|
180
143
|
has_solutions = false
|
@@ -184,22 +147,24 @@ class Raramorph
|
|
184
147
|
|
185
148
|
|
186
149
|
#output solutions : TODO consider XML output
|
187
|
-
if @
|
150
|
+
if @logger.output != nil
|
188
151
|
if @found.has_key?(translitered)
|
189
152
|
if @@sol.has_solutions(translitered)
|
190
|
-
@@sol.get_solutions(translitered).each{|solution| @
|
153
|
+
@@sol.get_solutions(translitered).each{|solution| @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
|
154
|
+
}
|
191
155
|
end
|
192
156
|
if @@sol.has_alternative_spellings(translitered)
|
193
|
-
@
|
157
|
+
@logger.info("No direct solution" , true )
|
194
158
|
@@sol.get_alternative_spellings(translitered).each{|alternative|
|
195
|
-
|
159
|
+
@logger.info("Considering alternative spelling :" + "\t#{alternative}" , true )
|
196
160
|
if @@sol.has_solutions(alternative)
|
197
|
-
@@sol.get_solutions(alternative).each{|solution| @
|
161
|
+
@@sol.get_solutions(alternative).each{|solution| @logger.info "#{output_buckwalter ? solution.to_s : solution.to_arabized_string}"
|
162
|
+
}
|
198
163
|
end
|
199
164
|
}
|
200
165
|
end
|
201
166
|
elsif @not_found.has_key?(translitered)
|
202
|
-
@
|
167
|
+
@logger.info "\nNo solution\n"
|
203
168
|
else
|
204
169
|
raise "#{translitered} is neither in found or notFound !"
|
205
170
|
end
|
@@ -220,40 +185,7 @@ class Raramorph
|
|
220
185
|
segments = segment_word(translitered) #Hash Set of Segement Words Objects
|
221
186
|
#Brute force algorithm
|
222
187
|
segments.each{|segmented_word|
|
223
|
-
|
224
|
-
if @@dict.has_prefix?(segmented_word.prefix)
|
225
|
-
#Is stem known ?
|
226
|
-
# puts "has prefix"
|
227
|
-
if @@dict.has_stem?(segmented_word.stem)
|
228
|
-
# puts "has stem"
|
229
|
-
#Is suffix known ?
|
230
|
-
if @@dict.has_suffix?(segmented_word.suffix)
|
231
|
-
# puts "has suffix"
|
232
|
-
#Compatibility check
|
233
|
-
@@dict.prefixes[segmented_word.prefix].each{|prefix|
|
234
|
-
@@dict.stems[segmented_word.stem].each {|stem|
|
235
|
-
#Prefix/Stem compatibility
|
236
|
-
if @@dict.prefixes_stems_compatible?(prefix.morphology ,stem.morphology )
|
237
|
-
# puts "has A B Com"
|
238
|
-
@@dict.suffixes[segmented_word.suffix].each {|suffix|
|
239
|
-
# Prefix/Suffix compatiblity
|
240
|
-
if @@dict.prefixes_suffixes_compatible?(prefix.morphology , suffix.morphology)
|
241
|
-
# puts "has A C Com"
|
242
|
-
# Stems/Suffixes compatiblity
|
243
|
-
if @@dict.stems_suffixes_compatible?(stem.morphology , suffix.morphology)
|
244
|
-
# puts "has B C COM"
|
245
|
-
#All tests passed : it is a solution
|
246
|
-
count = count + 1
|
247
|
-
word_solutions << Solution.new(@verbose , count , prefix , stem , suffix )
|
248
|
-
end
|
249
|
-
end
|
250
|
-
}
|
251
|
-
end
|
252
|
-
}
|
253
|
-
}
|
254
|
-
end
|
255
|
-
end
|
256
|
-
end
|
188
|
+
count = @@dict.analyze_word_in_dictionaries(segmented_word , word_solutions , @logger.verbose , count )
|
257
189
|
}
|
258
190
|
|
259
191
|
#Add all solutions, if any
|
@@ -322,37 +254,29 @@ class Raramorph
|
|
322
254
|
# * Find Alternative Spellings for the translitered word
|
323
255
|
# * [translitered] word to be processed
|
324
256
|
def self.feed_alternative_spellings(translitered)
|
325
|
-
|
257
|
+
return true if(@@sol.has_alternative_spellings(translitered))
|
326
258
|
word_alternative_spellings = Set.new
|
327
259
|
temp = translitered
|
328
260
|
|
329
261
|
if( temp.match(@@alternative_spellings[0]) )
|
330
262
|
temp.gsub!(/Y/, "y")
|
331
|
-
|
332
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
333
|
-
end
|
263
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
334
264
|
word_alternative_spellings.add(temp)
|
335
265
|
temp2 = temp.sub(/w/, "&")
|
336
266
|
if(temp!=temp2)
|
337
267
|
temp = temp2
|
338
|
-
|
339
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
340
|
-
end
|
268
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
341
269
|
word_alternative_spellings.add(temp)
|
342
270
|
end
|
343
271
|
temp=translitered
|
344
272
|
temp.gsub!(/Y/,"y")
|
345
273
|
temp.sub!(/y'$/,"}")
|
346
|
-
|
347
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
348
|
-
end
|
274
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
349
275
|
word_alternative_spellings.add(temp)
|
350
276
|
temp2 = temp.sub(/w/, "&")
|
351
277
|
if(temp!=temp2)
|
352
278
|
temp = temp2
|
353
|
-
|
354
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
355
|
-
end
|
279
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
356
280
|
word_alternative_spellings.add(temp)
|
357
281
|
end
|
358
282
|
|
@@ -360,32 +284,24 @@ class Raramorph
|
|
360
284
|
temp2 = temp.gsub(/Y/,"y")
|
361
285
|
if(temp != temp2 )
|
362
286
|
temp = temp2
|
363
|
-
|
364
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
365
|
-
end
|
287
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
366
288
|
word_alternative_spellings.add(temp)
|
367
289
|
end
|
368
290
|
temp2 = temp.sub(/w'/, "&")
|
369
291
|
if(temp != temp2 )
|
370
292
|
temp = temp2
|
371
|
-
|
372
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
373
|
-
end
|
293
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
374
294
|
word_alternative_spellings.add(temp)
|
375
295
|
end
|
376
296
|
temp =translitered
|
377
297
|
temp.gsub!(/Y/, "y")
|
378
298
|
temp.sub!(/y'$/, "}")
|
379
|
-
|
380
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
381
|
-
end
|
299
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
382
300
|
word_alternative_spellings.add(temp)
|
383
301
|
temp2 = temp.sub(/w'/, "&")
|
384
302
|
if(temp != temp2 )
|
385
303
|
temp = temp2
|
386
|
-
|
387
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
388
|
-
end
|
304
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
389
305
|
word_alternative_spellings.add(temp)
|
390
306
|
end
|
391
307
|
|
@@ -394,24 +310,18 @@ class Raramorph
|
|
394
310
|
temp2 = temp.sub(/w'/, "&")
|
395
311
|
if(temp != temp2 )
|
396
312
|
temp = temp2
|
397
|
-
|
398
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
399
|
-
end
|
313
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
400
314
|
word_alternative_spellings.add(temp)
|
401
315
|
end
|
402
316
|
temp =translitered
|
403
317
|
temp.gsub!(/Y/, "y")
|
404
318
|
temp.gsub!(/y$/, "Y")
|
405
|
-
|
406
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
407
|
-
end
|
319
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
408
320
|
word_alternative_spellings.add(temp)
|
409
321
|
temp2 = temp.sub(/w'/, "&")
|
410
322
|
if(temp != temp2 )
|
411
323
|
temp = temp2
|
412
|
-
|
413
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
414
|
-
end
|
324
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
415
325
|
word_alternative_spellings.add(temp)
|
416
326
|
end
|
417
327
|
|
@@ -419,23 +329,17 @@ class Raramorph
|
|
419
329
|
temp2 = temp.gsub(/Y/,"y")
|
420
330
|
if(temp != temp2 )
|
421
331
|
temp = temp2
|
422
|
-
|
423
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
424
|
-
end
|
332
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
425
333
|
word_alternative_spellings.add(temp)
|
426
334
|
end
|
427
335
|
temp2 = temp.sub(/w'/, "&")
|
428
336
|
if(temp != temp2 )
|
429
337
|
temp = temp2
|
430
|
-
|
431
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
432
|
-
end
|
338
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
433
339
|
word_alternative_spellings.add(temp)
|
434
340
|
end
|
435
341
|
temp.sub!(/p$/, "h")
|
436
|
-
|
437
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
438
|
-
end
|
342
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
439
343
|
word_alternative_spellings.add(temp)
|
440
344
|
|
441
345
|
else
|
@@ -443,40 +347,30 @@ class Raramorph
|
|
443
347
|
if(temp!=temp2)
|
444
348
|
temp = temp2
|
445
349
|
temp.gsub!(/Y/, "y")
|
446
|
-
|
447
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
448
|
-
end
|
350
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
449
351
|
word_alternative_spellings.add(temp)
|
450
352
|
temp2 = temp.sub(/w'/, "&")
|
451
353
|
if(temp != temp2 )
|
452
354
|
temp = temp2
|
453
|
-
|
454
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
455
|
-
end
|
355
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
456
356
|
word_alternative_spellings.add(temp)
|
457
357
|
end
|
458
358
|
else
|
459
359
|
temp2 = temp.gsub(/Y/, "y")
|
460
360
|
if(temp != temp2)
|
461
|
-
|
462
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
463
|
-
end
|
361
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
464
362
|
word_alternative_spellings.add(temp)
|
465
363
|
temp2 = temp.sub(/w'/, "&")
|
466
364
|
if(temp != temp2 )
|
467
365
|
temp = temp2
|
468
|
-
|
469
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
470
|
-
end
|
366
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
471
367
|
word_alternative_spellings.add(temp)
|
472
368
|
end
|
473
369
|
else
|
474
370
|
temp2 = temp.sub(/w'/, "&")
|
475
371
|
if(temp != temp2 )
|
476
372
|
temp = temp2
|
477
|
-
|
478
|
-
@stream.puts "Found alternative spelling "+ temp + " for word " + translitered
|
479
|
-
end
|
373
|
+
@logger.info "Found alternative spelling #{temp} for word #{translitered}" , true
|
480
374
|
word_alternative_spellings.add(temp)
|
481
375
|
end
|
482
376
|
end
|
@@ -494,10 +388,9 @@ class Raramorph
|
|
494
388
|
# * [output_filename] Output file path
|
495
389
|
# * [verbose] Setter for verbose
|
496
390
|
# * [not_arabic] alias for output_buckwalter : indicates whether the output is written in Buckwalter notation or in Arabic
|
497
|
-
def self.execute(input_filename, output_filename ,verbose = false, not_arabic =
|
498
|
-
@
|
391
|
+
def self.execute(input_filename, output_filename ,verbose = false, not_arabic = false)
|
392
|
+
@logger = Logger.new(true , output_filename )
|
499
393
|
@not_arabic = not_arabic
|
500
|
-
@verbose = verbose
|
501
394
|
# Lines processed
|
502
395
|
@lines_counter = 0
|
503
396
|
# Arabic tokens processed
|
@@ -514,24 +407,11 @@ class Raramorph
|
|
514
407
|
# * [value] = occurences
|
515
408
|
#
|
516
409
|
@not_found = {}
|
517
|
-
@stream = StringIO.new
|
518
|
-
|
519
410
|
analyze(input_filename , @not_arabic)
|
520
|
-
|
521
|
-
f.puts @stream.string
|
522
|
-
end
|
411
|
+
@logger.log
|
523
412
|
print_stats
|
524
413
|
end
|
525
414
|
end
|
526
415
|
|
527
|
-
|
528
|
-
|
529
|
-
# Author:: eSpace technologies www.eSpace.com.eg
|
530
|
-
# Copyright:: 2008
|
531
|
-
attr_reader :prefix , :stem , :suffix
|
532
|
-
def initialize(prefix , stem , suffix)
|
533
|
-
@prefix = prefix
|
534
|
-
@stem = stem
|
535
|
-
@suffix = suffix
|
536
|
-
end
|
537
|
-
end
|
416
|
+
class SegmentedWord < Struct.new( :prefix , :stem , :suffix) ; end
|
417
|
+
|
data/lib/raramorph/solution.rb
CHANGED
@@ -1,14 +1,47 @@
|
|
1
1
|
# A class to find the solution of the word
|
2
|
-
#
|
3
2
|
# Author:: eSpace technologies www.eSpace.com.eg
|
4
3
|
# Copyright:: 2008
|
5
|
-
|
4
|
+
|
6
5
|
|
7
6
|
|
8
7
|
class Solution
|
9
8
|
|
10
9
|
attr_reader :prefix, :stem, :suffix, :cnt
|
10
|
+
@@ends_with_set_for_pos_one = Set.new(["CONJ","EMPHATIC_PARTICLE","FUNC_WORD",
|
11
|
+
"FUT_PART","INTERJ","INTERROG_PART","IV1S","IV2MS",
|
12
|
+
"IV2FS","IV3MS","IV3FS","IV2D","IV2FD","IV3MD","IV3FD",
|
13
|
+
"IV1P","IV2MP","IV2FP","IV3MP","IV3FP","NEG_PART",
|
14
|
+
"PREP","RESULT_CLAUSE_PARTICLE"])
|
11
15
|
|
16
|
+
@@ends_with_set_for_pos_two = Set.new(["CASE_INDEF_NOM","CASE_INDEF_ACC",
|
17
|
+
"CASE_INDEF_ACCGEN","CASE_INDEF_GEN" ,"CASE_DEF_NOM" ,
|
18
|
+
"CASE_DEF_ACC" ,"CASE_DEF_ACCGEN","CASE_DEF_GEN" ,
|
19
|
+
"NSUFF_MASC_SG_ACC_INDEF" ,"NSUFF_FEM_SG" ,"NSUFF_MASC_DU_NOM" ,
|
20
|
+
"NSUFF_MASC_DU_NOM_POSS" ,"NSUFF_MASC_DU_ACCGEN" ,
|
21
|
+
"NSUFF_MASC_DU_ACCGEN_POSS" ,"NSUFF_FEM_DU_NOM" ,
|
22
|
+
"NSUFF_FEM_DU_NOM_POSS" ,"NSUFF_FEM_DU_ACCGEN" ,
|
23
|
+
"NSUFF_FEM_DU_ACCGEN_POSS" ,"NSUFF_MASC_PL_NOM" ,
|
24
|
+
"NSUFF_MASC_PL_NOM_POSS" ,"NSUFF_MASC_PL_ACCGEN" ,
|
25
|
+
"NSUFF_MASC_PL_ACCGEN_POSS" ,"NSUFF_FEM_PL" ,"POSS_PRON_1S",
|
26
|
+
"POSS_PRON_2MS" ,"POSS_PRON_2FS" ,"POSS_PRON_3MS" ,
|
27
|
+
"POSS_PRON_3FS","POSS_PRON_2D" ,"POSS_PRON_3D" ,"POSS_PRON_1P",
|
28
|
+
"POSS_PRON_2MP" ,"POSS_PRON_2FP" ,"POSS_PRON_3MP" ,"POSS_PRON_3FP" ,
|
29
|
+
"IVSUFF_DO:1S" ,"IVSUFF_DO:2MS" ,"IVSUFF_DO:2FS" ,"IVSUFF_DO:3MS" ,
|
30
|
+
"IVSUFF_DO:3FS" ,"IVSUFF_DO:2D" ,"IVSUFF_DO:3D" ,"IVSUFF_DO:1P" ,
|
31
|
+
"IVSUFF_DO:2MP" ,"IVSUFF_DO:2FP" ,"IVSUFF_DO:3MP" ,"IVSUFF_DO:3FP" ,
|
32
|
+
"IVSUFF_MOOD:I" ,"IVSUFF_SUBJ:2FS_MOOD:I" ,"IVSUFF_SUBJ:D_MOOD:I" ,
|
33
|
+
"IVSUFF_SUBJ:3D_MOOD:I" ,"IVSUFF_SUBJ:MP_MOOD:I" ,"IVSUFF_MOOD:S",
|
34
|
+
"IVSUFF_SUBJ:2FS_MOOD:SJ" ,"IVSUFF_SUBJ:D_MOOD:SJ","IVSUFF_SUBJ:MP_MOOD:SJ" ,
|
35
|
+
"IVSUFF_SUBJ:3MP_MOOD:SJ" ,"IVSUFF_SUBJ:FP" ,"PVSUFF_DO:1S" ,"PVSUFF_DO:2MS" ,
|
36
|
+
"PVSUFF_DO:2FS" ,"PVSUFF_DO:3MS" ,"PVSUFF_DO:3FS" ,"PVSUFF_DO:2D" ,
|
37
|
+
"PVSUFF_DO:3D" ,"PVSUFF_DO:1P" ,"PVSUFF_DO:2MP" ,"PVSUFF_DO:2FP" ,
|
38
|
+
"PVSUFF_DO:3MP" ,"PVSUFF_DO:3FP" ,"PVSUFF_SUBJ:1S" ,"PVSUFF_SUBJ:2MS" ,
|
39
|
+
"PVSUFF_SUBJ:2FS" ,"PVSUFF_SUBJ:3MS" ,"PVSUFF_SUBJ:3FS" ,"PVSUFF_SUBJ:2MD" ,
|
40
|
+
"PVSUFF_SUBJ:2FD" ,"PVSUFF_SUBJ:3MD" ,"PVSUFF_SUBJ:3FD" ,"PVSUFF_SUBJ:1P" ,
|
41
|
+
"PVSUFF_SUBJ:2MP" ,"PVSUFF_SUBJ:2FP" ,"PVSUFF_SUBJ:3MP" ,"PVSUFF_SUBJ:3FP" ,
|
42
|
+
"CVSUFF_DO:1S" ,"CVSUFF_DO:3MS" ,"CVSUFF_DO:3FS" ,"CVSUFF_DO:3D" ,
|
43
|
+
"CVSUFF_DO:1P" ,"CVSUFF_DO:3MP" ,"CVSUFF_DO:3FP" ,"CVSUFF_SUBJ:2MS" ,
|
44
|
+
"CVSUFF_SUBJ:2FS" ,"CVSUFF_SUBJ:2MP"])
|
12
45
|
protected
|
13
46
|
|
14
47
|
# Constructs a solution for a word. Note that the prefix, stem and suffix combination is <b>recomputed</b>
|
@@ -42,60 +75,35 @@ class Solution
|
|
42
75
|
@stemsGlosses = stem.glosses
|
43
76
|
#The suffixes glosses.
|
44
77
|
@suffixesGlosses = suffix.glosses
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
puts "\"" + get_lemma() + "\" : stem's sizes for POS (" + @stemsPOS.length.to_s + ") and GLOSS ("+ @stemsGlosses.length.to_s + ") do not match"
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
78
|
+
|
79
|
+
puts "\"#{get_lemma()}\" : stem's sizes for POS (\"#{@stemsPOS.length.to_s}\") and GLOSS (\"#{@stemsGlosses.length.to_s}\") do not match" if (@stemsPOS.length != @stemsGlosses.length and @debug)
|
80
|
+
|
52
81
|
#Normalize stems since some of them can contain prefixes
|
53
82
|
|
54
83
|
while(@stemsPOS.length>0)
|
55
84
|
stemPOS = @stemsPOS.slice(0)
|
56
|
-
|
57
|
-
|
58
|
-
|
85
|
+
|
86
|
+
stemPOS.force_encoding "UTF-8" if(stemPOS)
|
87
|
+
|
59
88
|
if (@stemsGlosses.length>0)
|
60
89
|
stemGloss = @stemsGlosses.slice(0)
|
61
90
|
else
|
62
91
|
stemGloss = nil
|
63
92
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
stemPOS.end_with?("FUNC_WORD") or
|
70
|
-
stemPOS.end_with?("FUT_PART") or
|
71
|
-
stemPOS.end_with?("INTERJ") or
|
72
|
-
stemPOS.end_with?("INTERROG_PART") or
|
73
|
-
stemPOS.end_with?("IV1S") or
|
74
|
-
stemPOS.end_with?("IV2MS") or
|
75
|
-
stemPOS.end_with?("IV2FS") or
|
76
|
-
stemPOS.end_with?("IV3MS") or
|
77
|
-
stemPOS.end_with?("IV3FS") or
|
78
|
-
stemPOS.end_with?("IV2D") or
|
79
|
-
stemPOS.end_with?("IV2FD") or
|
80
|
-
stemPOS.end_with?("IV3MD") or
|
81
|
-
stemPOS.end_with?("IV3FD") or
|
82
|
-
stemPOS.end_with?("IV1P") or
|
83
|
-
stemPOS.end_with?("IV2MP") or
|
84
|
-
stemPOS.end_with?("IV2FP") or
|
85
|
-
stemPOS.end_with?("IV3MP") or
|
86
|
-
stemPOS.end_with?("IV3FP") or
|
87
|
-
stemPOS.end_with?("NEG_PART") or
|
88
|
-
stemPOS.end_with?("PREP") or
|
89
|
-
stemPOS.end_with?("RESULT_CLAUSE_PARTICLE") )
|
93
|
+
|
94
|
+
stemGloss.force_encoding "UTF-8" if(stemGloss)
|
95
|
+
|
96
|
+
|
97
|
+
if(stemPOS.ends_with_suffix_set?(@@ends_with_set_for_pos_one) )
|
90
98
|
@stemsPOS.slice!(0)
|
91
99
|
@prefixesPOS.push(stemPOS)
|
92
100
|
if (stemGloss)
|
93
101
|
@stemsGlosses.slice!(0)
|
94
102
|
@prefixesGlosses.push(stemGloss)
|
95
103
|
end
|
96
|
-
|
97
|
-
|
98
|
-
|
104
|
+
else
|
105
|
+
break
|
106
|
+
end
|
99
107
|
end
|
100
108
|
|
101
109
|
#Normalize stems since some of them can contain suffixes
|
@@ -112,101 +120,8 @@ class Solution
|
|
112
120
|
if(stemGloss)
|
113
121
|
stemGloss.force_encoding "UTF-8"
|
114
122
|
end
|
115
|
-
|
116
|
-
if
|
117
|
-
stemPOS.end_with?("CASE_INDEF_ACC") or
|
118
|
-
stemPOS.end_with?("CASE_INDEF_ACCGEN") or
|
119
|
-
stemPOS.end_with?("CASE_INDEF_GEN") or
|
120
|
-
stemPOS.end_with?("CASE_DEF_NOM") or
|
121
|
-
stemPOS.end_with?("CASE_DEF_ACC") or
|
122
|
-
stemPOS.end_with?("CASE_DEF_ACCGEN") or
|
123
|
-
stemPOS.end_with?("CASE_DEF_GEN") or
|
124
|
-
stemPOS.end_with?("NSUFF_MASC_SG_ACC_INDEF") or
|
125
|
-
stemPOS.end_with?("NSUFF_FEM_SG") or
|
126
|
-
stemPOS.end_with?("NSUFF_MASC_DU_NOM") or
|
127
|
-
stemPOS.end_with?("NSUFF_MASC_DU_NOM_POSS") or
|
128
|
-
stemPOS.end_with?("NSUFF_MASC_DU_ACCGEN") or
|
129
|
-
stemPOS.end_with?("NSUFF_MASC_DU_ACCGEN_POSS") or
|
130
|
-
stemPOS.end_with?("NSUFF_FEM_DU_NOM") or
|
131
|
-
stemPOS.end_with?("NSUFF_FEM_DU_NOM_POSS") or
|
132
|
-
stemPOS.end_with?("NSUFF_FEM_DU_ACCGEN") or
|
133
|
-
stemPOS.end_with?("NSUFF_FEM_DU_ACCGEN_POSS") or
|
134
|
-
stemPOS.end_with?("NSUFF_MASC_PL_NOM") or
|
135
|
-
stemPOS.end_with?("NSUFF_MASC_PL_NOM_POSS") or
|
136
|
-
stemPOS.end_with?("NSUFF_MASC_PL_ACCGEN") or
|
137
|
-
stemPOS.end_with?("NSUFF_MASC_PL_ACCGEN_POSS") or
|
138
|
-
stemPOS.end_with?("NSUFF_FEM_PL") or
|
139
|
-
stemPOS.end_with?("POSS_PRON_1S") or
|
140
|
-
stemPOS.end_with?("POSS_PRON_2MS") or
|
141
|
-
stemPOS.end_with?("POSS_PRON_2FS") or
|
142
|
-
stemPOS.end_with?("POSS_PRON_3MS") or
|
143
|
-
stemPOS.end_with?("POSS_PRON_3FS") or
|
144
|
-
stemPOS.end_with?("POSS_PRON_2D") or
|
145
|
-
stemPOS.end_with?("POSS_PRON_3D") or
|
146
|
-
stemPOS.end_with?("POSS_PRON_1P") or
|
147
|
-
stemPOS.end_with?("POSS_PRON_2MP") or
|
148
|
-
stemPOS.end_with?("POSS_PRON_2FP") or
|
149
|
-
stemPOS.end_with?("POSS_PRON_3MP") or
|
150
|
-
stemPOS.end_with?("POSS_PRON_3FP") or
|
151
|
-
stemPOS.end_with?("IVSUFF_DO:1S") or
|
152
|
-
stemPOS.end_with?("IVSUFF_DO:2MS") or
|
153
|
-
stemPOS.end_with?("IVSUFF_DO:2FS") or
|
154
|
-
stemPOS.end_with?("IVSUFF_DO:3MS") or
|
155
|
-
stemPOS.end_with?("IVSUFF_DO:3FS") or
|
156
|
-
stemPOS.end_with?("IVSUFF_DO:2D") or
|
157
|
-
stemPOS.end_with?("IVSUFF_DO:3D") or
|
158
|
-
stemPOS.end_with?("IVSUFF_DO:1P") or
|
159
|
-
stemPOS.end_with?("IVSUFF_DO:2MP") or
|
160
|
-
stemPOS.end_with?("IVSUFF_DO:2FP") or
|
161
|
-
stemPOS.end_with?("IVSUFF_DO:3MP") or
|
162
|
-
stemPOS.end_with?("IVSUFF_DO:3FP") or
|
163
|
-
stemPOS.end_with?("IVSUFF_MOOD:I") or
|
164
|
-
stemPOS.end_with?("IVSUFF_SUBJ:2FS_MOOD:I") or
|
165
|
-
stemPOS.end_with?("IVSUFF_SUBJ:D_MOOD:I") or
|
166
|
-
stemPOS.end_with?("IVSUFF_SUBJ:3D_MOOD:I") or
|
167
|
-
stemPOS.end_with?("IVSUFF_SUBJ:MP_MOOD:I") or
|
168
|
-
stemPOS.end_with?("IVSUFF_MOOD:S") or
|
169
|
-
stemPOS.end_with?("IVSUFF_SUBJ:2FS_MOOD:SJ") or
|
170
|
-
stemPOS.end_with?("IVSUFF_SUBJ:D_MOOD:SJ") or
|
171
|
-
stemPOS.end_with?("IVSUFF_SUBJ:MP_MOOD:SJ") or
|
172
|
-
stemPOS.end_with?("IVSUFF_SUBJ:3MP_MOOD:SJ") or
|
173
|
-
stemPOS.end_with?("IVSUFF_SUBJ:FP") or
|
174
|
-
stemPOS.end_with?("PVSUFF_DO:1S") or
|
175
|
-
stemPOS.end_with?("PVSUFF_DO:2MS") or
|
176
|
-
stemPOS.end_with?("PVSUFF_DO:2FS") or
|
177
|
-
stemPOS.end_with?("PVSUFF_DO:3MS") or
|
178
|
-
stemPOS.end_with?("PVSUFF_DO:3FS") or
|
179
|
-
stemPOS.end_with?("PVSUFF_DO:2D") or
|
180
|
-
stemPOS.end_with?("PVSUFF_DO:3D") or
|
181
|
-
stemPOS.end_with?("PVSUFF_DO:1P") or
|
182
|
-
stemPOS.end_with?("PVSUFF_DO:2MP") or
|
183
|
-
stemPOS.end_with?("PVSUFF_DO:2FP") or
|
184
|
-
stemPOS.end_with?("PVSUFF_DO:3MP") or
|
185
|
-
stemPOS.end_with?("PVSUFF_DO:3FP") or
|
186
|
-
stemPOS.end_with?("PVSUFF_SUBJ:1S") or
|
187
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2MS") or
|
188
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2FS") or
|
189
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3MS") or
|
190
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3FS") or
|
191
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2MD") or
|
192
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2FD") or
|
193
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3MD") or
|
194
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3FD") or
|
195
|
-
stemPOS.end_with?("PVSUFF_SUBJ:1P") or
|
196
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2MP") or
|
197
|
-
stemPOS.end_with?("PVSUFF_SUBJ:2FP") or
|
198
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3MP") or
|
199
|
-
stemPOS.end_with?("PVSUFF_SUBJ:3FP") or
|
200
|
-
stemPOS.end_with?("CVSUFF_DO:1S") or
|
201
|
-
stemPOS.end_with?("CVSUFF_DO:3MS") or
|
202
|
-
stemPOS.end_with?("CVSUFF_DO:3FS") or
|
203
|
-
stemPOS.end_with?("CVSUFF_DO:3D") or
|
204
|
-
stemPOS.end_with?("CVSUFF_DO:1P") or
|
205
|
-
stemPOS.end_with?("CVSUFF_DO:3MP") or
|
206
|
-
stemPOS.end_with?("CVSUFF_DO:3FP") or
|
207
|
-
stemPOS.end_with?("CVSUFF_SUBJ:2MS") or
|
208
|
-
stemPOS.end_with?("CVSUFF_SUBJ:2FS") or
|
209
|
-
stemPOS.end_with?("CVSUFF_SUBJ:2MP") )
|
123
|
+
|
124
|
+
if(stemPOS.ends_with_suffix_set?(@@ends_with_set_for_pos_two))
|
210
125
|
@stemsPOS.slice!(@stemsPOS.length-1)
|
211
126
|
@suffixesPOS.insert(0,stemPOS)
|
212
127
|
if (stemGloss)
|
@@ -332,14 +247,10 @@ class Solution
|
|
332
247
|
sb = ""
|
333
248
|
sb.force_encoding "UTF-8"
|
334
249
|
vocal = get_prefixes_arabic_vocalizations()
|
335
|
-
|
336
|
-
sb += vocal[0].to_s
|
337
|
-
end
|
250
|
+
sb += vocal[0].to_s if vocal!=nil
|
338
251
|
|
339
|
-
s = get_stem_arabic_vocalization()
|
340
|
-
|
341
|
-
sb+=s
|
342
|
-
end
|
252
|
+
s = get_stem_arabic_vocalization()
|
253
|
+
sb+=s if s!= nil
|
343
254
|
vocal = get_suffixes_arabic_vocalizations()
|
344
255
|
if(vocal!=nil)
|
345
256
|
sb += vocal[0].to_s
|
@@ -376,13 +287,13 @@ class Solution
|
|
376
287
|
sb = ""
|
377
288
|
sb.force_encoding "UTF-8"
|
378
289
|
if (!@prefix.morphology.empty? and @prefix.morphology != nil )
|
379
|
-
sb+= "\
|
290
|
+
sb+= "\tprefix : #{@prefix.morphology}\n"
|
380
291
|
end
|
381
292
|
if (!@stem.morphology.empty? and @stem.morphology != nil)
|
382
|
-
sb+= "\
|
293
|
+
sb+= "\tstem : #{@stem.morphology}\n"
|
383
294
|
end
|
384
295
|
if (!@suffix.morphology.empty? and @suffix.morphology != nil)
|
385
|
-
sb+= "\
|
296
|
+
sb+= "\tsuffix : #{@suffix.morphology}\n"
|
386
297
|
end
|
387
298
|
return sb
|
388
299
|
end
|
@@ -517,14 +428,14 @@ class Solution
|
|
517
428
|
sb.force_encoding "UTF-8"
|
518
429
|
glosses = get_prefixes_glosses()
|
519
430
|
if (glosses and glosses[0] != nil)
|
520
|
-
sb+=("\
|
431
|
+
sb+=("\tprefix : #{glosses[0].gsub(";","/")}\n")
|
521
432
|
end
|
522
433
|
if (get_stem_gloss() != nil)
|
523
|
-
sb+=("\
|
434
|
+
sb+=("\tstem : #{get_stem_gloss().gsub(";","/")}\n")
|
524
435
|
end
|
525
436
|
glosses = get_suffixes_glosses()
|
526
437
|
if (glosses and glosses[0] != nil)
|
527
|
-
sb+=("\
|
438
|
+
sb+=("\tsuffix : #{glosses[0].gsub(";","/")}\n")
|
528
439
|
end
|
529
440
|
return sb
|
530
441
|
end
|
@@ -603,32 +514,26 @@ class Solution
|
|
603
514
|
end
|
604
515
|
temp_POS = []
|
605
516
|
arr.each do |pos|
|
606
|
-
array = pos.split("/")
|
517
|
+
array = pos.split("/")
|
607
518
|
j=1
|
608
519
|
if(type==1)
|
609
520
|
sb = ""
|
610
521
|
elsif(type==2)
|
611
|
-
sb = array[0]
|
522
|
+
sb = "#{array[0]}\t"
|
612
523
|
else
|
613
|
-
sb = LatinArabicTranslator.translate(array[0])
|
524
|
+
sb = "#{LatinArabicTranslator.translate(array[0])}\t"
|
614
525
|
sb.force_encoding "UTF-8"
|
615
526
|
end
|
616
|
-
|
617
|
-
if (j > 1)
|
618
|
-
sb+=" / "
|
619
|
-
end
|
620
|
-
sb+=array[j]
|
621
|
-
j+=1
|
622
|
-
end
|
527
|
+
sb << array[1..array.length].join(" / ")
|
623
528
|
temp_POS.push(sb)
|
624
529
|
end
|
625
530
|
|
626
531
|
if(pre_stem_suff==2)
|
627
532
|
if ((temp_POS.length > 1) and @debug)
|
628
|
-
puts "More than one stem for
|
533
|
+
puts "More than one stem for #{temp_POS.to_s}"
|
629
534
|
end
|
630
535
|
if (type ==1 and temp_POS[0].empty?)
|
631
|
-
puts "Empty POS for stem
|
536
|
+
puts "Empty POS for stem #{get_stem_long_POS()}"
|
632
537
|
end
|
633
538
|
#return the first anyway :-(
|
634
539
|
return temp_POS[0]
|
@@ -649,7 +554,7 @@ class Solution
|
|
649
554
|
end
|
650
555
|
if (temp_POS != nil)
|
651
556
|
if (temp_POS[0]!=nil)
|
652
|
-
sb
|
557
|
+
sb << ("\tprefix : #{temp_POS[0]}\n")
|
653
558
|
end
|
654
559
|
end
|
655
560
|
if(arabic)
|
@@ -658,7 +563,7 @@ class Solution
|
|
658
563
|
s = get_stem_long_POS()
|
659
564
|
end
|
660
565
|
if ( s != nil)
|
661
|
-
sb
|
566
|
+
sb << ("\tstem : #{s} \n")
|
662
567
|
end
|
663
568
|
if(arabic)
|
664
569
|
temp_POS =get_suffixes_arabic_long_POS()
|
@@ -667,9 +572,21 @@ class Solution
|
|
667
572
|
end
|
668
573
|
if (temp_POS != nil)
|
669
574
|
if (temp_POS[0]!=nil)
|
670
|
-
sb
|
575
|
+
sb << ("\tsuffix : #{temp_POS[0]}\n")
|
671
576
|
end
|
672
577
|
end
|
673
578
|
return sb
|
674
579
|
end
|
580
|
+
|
675
581
|
end
|
582
|
+
|
583
|
+
class String
  # Reports whether any non-empty trailing substring of this string is a
  # member of +ends_with_suffix_set+.
  #
  # ends_with_suffix_set:: any collection responding to +member?+
  #                        (for example a Set of strings).
  #
  # Returns true as soon as one tail of the receiver is found in the
  # collection and false otherwise. The empty tail is never tested, so an
  # empty receiver always yields false, and an empty-string member never
  # matches.
  def ends_with_suffix_set?(ends_with_suffix_set)
    # Tails are tried from the whole string down to its last character.
    (0...length).any? { |start| ends_with_suffix_set.member?(self[start..-1]) }
  end
end
|
data/lib/raramorph/translator.rb
CHANGED
data/lib/raramorph.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
#Dir[File.join(File.dirname(__FILE__), 'raramorph/**/*.rb')].sort.each { |lib| require lib }
|
2
|
+
|
2
3
|
$:.unshift File.expand_path(File.dirname(__FILE__) )
|
3
4
|
start = Time.now
|
4
5
|
require 'set'
|
5
|
-
require 'stringio'
|
6
|
+
require 'stringio'
|
7
|
+
require 'raramorph/logger'
|
6
8
|
require 'raramorph/translator'
|
7
9
|
require 'raramorph/arabic_latin_translator'
|
8
10
|
require 'raramorph/latin_arabic_translator'
|
data/lib/raramorph_main.rb
CHANGED
@@ -3,26 +3,31 @@
|
|
3
3
|
# ARGV[2] # Verbose Default False
|
4
4
|
# ARGV[4] # BuckWalter Default False ( Arabic Output)
|
5
5
|
# Put this file's directory on the load path so `require 'raramorph'` resolves.
$:.unshift File.expand_path(File.dirname(__FILE__))

# Expected arguments: inFile outFile, optionally followed by up to two flags.
if ARGV.length.between?(2, 4)
  require 'raramorph'
  start = Time.now
  # Flags may appear in any order after the two file names, so "-a -v" is
  # treated the same as "-v -a" (previously "-v" was only honoured as the
  # third argument).
  flags = ARGV[2..-1]
  verbose = flags.include?("-v")
  not_arabic = !flags.include?("-a")
  Raramorph.execute(ARGV[0], ARGV[1], verbose, not_arabic)
  puts "Time Elapsed= " + (Time.now - start).to_s
else
  # Wrong argument count: print the banner and usage help.
  puts("Arabic Morphological Analyzer for Ruby")
  puts("Ported to Ruby by Moustafa Emara and Hany Salah El din , eSpace-technologies.(www.espace.com.eg) , 2008.")
  puts("Based on :")
  puts("BUCKWALTER ARABIC MORPHOLOGICAL ANALYZER")
  puts("This program is developed under the MIT-Licences")
  puts("Usage :")
  puts("")
  # The synopsis lists the arguments this script actually reads:
  # ARGV[0] is the input file and ARGV[1] the output file; no encoding
  # argument is accepted.
  puts("raraMorph inFile outFile [-v] [-a]")
  puts("")
  puts("inFile : file to be analyzed")
  puts("outFile : result file ")
  puts("-v : verbose mode")
  puts("-a : Aarbic Output" )
end
|
27
32
|
|
28
33
|
|
data/raramorph.gemspec
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "raramorph"
|
3
|
-
s.version = "0.1.
|
3
|
+
s.version = "0.1.1"
|
4
4
|
s.date = "2008-09-06"
|
5
|
-
s.summary = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )"
|
5
|
+
s.summary = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg ) "
|
6
6
|
s.email = "moustafa.emara@espace.com.eg"
|
7
7
|
s.homepage = "http://github.com/espace/raramorph"
|
8
8
|
s.description = "Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )"
|
@@ -20,6 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
"lib/raramorph/arabic_latin_translator.rb",
|
21
21
|
"lib/raramorph/latin_arabic_translator.rb",
|
22
22
|
"lib/raramorph/in_memory_dictionary_handler.rb",
|
23
|
+
"lib/raramorph/logger.rb",
|
23
24
|
"lib/dictionaries/dictPrefixes",
|
24
25
|
"lib/dictionaries/dictStems",
|
25
26
|
"lib/dictionaries/dictSuffixes",
|
@@ -34,7 +35,8 @@ Gem::Specification.new do |s|
|
|
34
35
|
s.executables = %w(raramorph)
|
35
36
|
s.required_ruby_version = '>= 1.9'
|
36
37
|
s.bindir = "bin"
|
37
|
-
|
38
|
-
|
38
|
+
s.rdoc_options = ["--main", "README"]
|
39
|
+
s.extra_rdoc_files = ["README"]
|
39
40
|
#s.extensions << "ext/extconf.rb"
|
41
|
+
|
40
42
|
end
|
metadata
CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: raramorph
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.
|
6
|
+
version: 0.1.1
|
7
7
|
date: 2008-09-06 00:00:00 +02:00
|
8
8
|
summary: Raramorph is a ruby gem for making morphological analysis and arabic indexing built using Ruby at eSpace-technologies ( www.espace.com.eg )
|
9
9
|
require_paths:
|
@@ -39,6 +39,7 @@ files:
|
|
39
39
|
- lib/raramorph/translator.rb
|
40
40
|
- lib/raramorph/arabic_latin_translator.rb
|
41
41
|
- lib/raramorph/latin_arabic_translator.rb
|
42
|
+
- lib/raramorph/logger.rb
|
42
43
|
- lib/dictionaries/dictPrefixes
|
43
44
|
- lib/dictionaries/dictStems
|
44
45
|
- lib/dictionaries/dictSuffixes
|
@@ -49,12 +50,14 @@ files:
|
|
49
50
|
- lib/raramorph.rb
|
50
51
|
- lib/raramorph_main.rb
|
51
52
|
- lib/test_input/UTF-8.txt
|
53
|
+
- README
|
52
54
|
test_files: []
|
53
55
|
|
54
|
-
rdoc_options:
|
55
|
-
|
56
|
-
|
57
|
-
|
56
|
+
rdoc_options:
|
57
|
+
- --main
|
58
|
+
- README
|
59
|
+
extra_rdoc_files:
|
60
|
+
- README
|
58
61
|
executables:
|
59
62
|
- raramorph
|
60
63
|
extensions: []
|