mosta-raramorph 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,38 @@
1
+ # Class for Arabic-to-Latin transliteration (Buckwalter romanization)
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
# Romanizes Arabic text with the Buckwalter transliteration table and then
# strips everything that the unvocalized dictionaries do not contain
# (short vowels / diacritics, superscript alef, alef wasla, tatweel).
class ArabicLatinTranslator
  # * Table used for translation from Arabic to Latin letters (romanization)
  # * According to the Buckwalter system dictionary
  #
  # U+0640 (TATWEEL) maps to the empty string, i.e. it is dropped. The table
  # used to list U+0640 twice ("_" and ""); in a Ruby hash literal the last
  # duplicate wins, so "" was always the effective value and is the one kept.
  TABLE = {
    "\u0621" => "'", "\u0622" => "|", "\u0623" => ">", "\u0624" => "&", "\u0625" => "<", "\u0626" => "}",
    "\u0627" => "A", "\u0628" => "b", "\u0629" => "p", "\u062A" => "t", "\u062B" => "v", "\u062C" => "j",
    "\u062D" => "H", "\u062E" => "x", "\u062F" => "d", "\u0630" => "*", "\u0631" => "r", "\u0632" => "z",
    "\u0633" => "s", "\u0634" => "$", "\u0635" => "S", "\u0636" => "D", "\u0637" => "T", "\u0638" => "Z",
    "\u0639" => "E", "\u063A" => "g", "\u0640" => "", "\u0641" => "f", "\u0642" => "q", "\u0643" => "k",
    "\u0644" => "l", "\u0645" => "m", "\u0646" => "n", "\u0647" => "h", "\u0648" => "w", "\u0649" => "Y",
    "\u064A" => "y", "\u064B" => "F", "\u064C" => "N", "\u064D" => "K", "\u064E" => "a", "\u064F" => "u",
    "\u0650" => "i", "\u0651" => "~", "\u0652" => "o", "\u0670" => "`", "\u0671" => "{", "\u067E" => "P",
    "\u0686" => "J", "\u06A4" => "V", "\u06AF" => "G", "\u0698" => "R", "\u060C" => ",", "\u061B" => ";",
    "\u061F" => "?"
  }.freeze

  # Not suitable for morphological analysis: remove all vowels/diacritics.
  # The character class is written in Buckwalter symbols, so it has to be
  # applied to the romanized text, not to the Arabic input.
  VOWEL_REMOVER = Regexp.compile("[FNKaui~o]")
  # Buckwalter symbols for superscript alef (`) and alef wasla ({).
  STRIPER = Regexp.compile("[`\\{]")

  # * Translate : transliterate the Arabic word to a Roman-lettered word
  # * [word] Word String to be processed (never modified; may be frozen)
  # * @return transliterated word with vowels/diacritics removed
  #
  # Characters without a TABLE entry are copied through unchanged, but any
  # character matched by VOWEL_REMOVER or STRIPER is removed from the result
  # whether it came from the table or from the input itself.
  def self.translate(word)
    # Work on a copy: the caller's string must keep its content and encoding.
    text = word.dup.force_encoding("UTF-8")
    romanized = String.new("")
    text.each_char { |char| romanized << TABLE.fetch(char, char) }
    romanized.gsub(VOWEL_REMOVER, "").gsub(STRIPER, "")
  end
end
@@ -0,0 +1,40 @@
1
+ # Class For Storing Dictionary Entries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
# One dictionary record: the surface form, its lemma, vocalization and
# morphological category, plus the gloss and POS information split into
# their "+"-separated components.
class DictionaryEntry
  attr_reader :entry, :lemma_id, :vocalization, :morphology, :gloss, :glosses, :pos

  # Components of the gloss / POS fields are separated by a literal "+".
  @@split_regex = Regexp.compile("\\+")

  # * Initialize a new dictionary entry
  # * [entry] surface form (stored stripped)
  # * [lemma_id] lemma identifier (stored stripped)
  # * [vocalization] vocalized form (stored stripped)
  # * [morphology] morphological category (stored stripped)
  # * [gloss] gloss string; kept verbatim in #gloss, split into #glosses
  # * [pos] POS string; split into the #pos array
  def initialize(entry, lemma_id, vocalization, morphology, gloss, pos)
    @entry = entry.strip
    @lemma_id = lemma_id.strip
    @vocalization = vocalization.strip
    @morphology = morphology.strip
    @gloss = gloss
    @glosses = fill_instance_array_from_sent_array(gloss.split(@@split_regex))
    @pos = fill_instance_array_from_sent_array(pos.split(@@split_regex))
  end

  private

  # * Normalize the pieces produced by splitting on "+"
  # * [sent_array] Array of raw pieces
  # * @return new Array with every piece stripped and a leading empty piece
  #   (produced by a field that starts with "+") removed
  #
  # The previous implementation assigned the stripped value to the block
  # variable only, so nothing was actually stripped.
  def fill_instance_array_from_sent_array(sent_array)
    pieces = sent_array.map(&:strip)
    pieces.shift if pieces.first == ""
    pieces
  end
end
@@ -0,0 +1,325 @@
1
+ # Class For Storing And Loading Dictionaries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
+ require 'rubygems'
7
require 'set'

# Singleton that keeps the prefix / stem / suffix dictionaries and the three
# compatibility tables in memory and answers lookups against them.
#
# Dictionaries are Hashes mapping an entry string to an Array of
# DictionaryEntry objects; compatibility tables are Sets of
# "<morphology A> <morphology B>" strings.
class InMemoryDictionaryHandler
  # Matches an explicit <pos>...</pos> annotation inside the gloss field.
  POS_TAG_REGEX = Regexp.compile(".*" + "<pos>(.+?)</pos>" + ".*")
  # Runs of whitespace separating the two columns of a compatibility table.
  COMPATIBILITY_SPLITTER = Regexp.compile("\\s+")
  # Morphology categories for which no POS is produced.
  NO_POS_MORPHOLOGY = Regexp.compile("^(Pref-0|Suff-0)$")
  # Morphology pattern => POS tag appended to the vocalization, tried in order.
  POS_BY_MORPHOLOGY = [
    [Regexp.compile("^F" + ".*"), "/FUNC_WORD"],
    [Regexp.compile("^IV" + ".*"), "/VERB_IMPERFECT"],
    [Regexp.compile("^PV" + ".*"), "/VERB_PERFECT"],
    [Regexp.compile("^CV" + ".*"), "/VERB_IMPERATIVE"]
  ].freeze
  NOUN_MORPHOLOGY = Regexp.compile("^N" + ".*")
  # A gloss starting with a capital letter is taken as a proper noun.
  CAPITALIZED_GLOSS = Regexp.compile("^[A-Z]" + ".*")

  #### Dictionaries are HASH OF ARRAYS #####
  @@prefixes = {}
  @@stems = {}
  @@suffixes = {}

  #### Compatibility tables are SETS OF STRINGS #####
  @@prefixes_stems_compatibility = Set.new
  @@prefixes_suffixes_compatibility = Set.new
  @@stems_suffixes_compatibility = Set.new

  # The unique instance (singleton pattern); built by ::create.
  @@handler = nil

  private_class_method :new

  # * Loads the dictionaries and compatibility tables and returns the handler
  # * @return the unique InMemoryDictionaryHandler instance
  #
  # The data is loaded only once. Previously every call reset the handler and
  # loaded again, appending duplicate prefix/suffix entries to the
  # dictionaries that were already in memory.
  def self.create
    return @@handler if @@handler

    # Start from empty containers so a load that failed half-way cannot leave
    # duplicates behind when it is retried.
    @@prefixes = {}
    @@stems = {}
    @@suffixes = {}
    @@prefixes_stems_compatibility = Set.new
    @@prefixes_suffixes_compatibility = Set.new
    @@stems_suffixes_compatibility = Set.new

    puts "Initializing in-memory dictionary handler..."
    # NOTE: process-wide setting, kept from the original implementation.
    Thread.abort_on_exception = true
    load_dictionary(@@prefixes, "dictPrefixes", dictionary_path("dictPrefixes"))
    load_stems_marshaled_dictionary
    load_dictionary(@@suffixes, "dictSuffixes", dictionary_path("dictSuffixes"))
    load_compatibility_table(@@prefixes_stems_compatibility, "prefixes_stems_compatibility", dictionary_path("tableAB"))
    load_compatibility_table(@@prefixes_suffixes_compatibility, "prefixes_suffixes_compatibility", dictionary_path("tableAC"))
    load_compatibility_table(@@stems_suffixes_compatibility, "stems_suffixes_compatibility", dictionary_path("tableBC"))
    puts "... Done ... "
    @@handler = new
  end

  # * Load the marshaled stems dictionary if available, otherwise load the
  #   original dictionary (which also writes the marshaled cache)
  def self.load_stems_marshaled_dictionary
    cache = dictionary_path("marshal_stems")
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    if File.exist?(cache)
      # Marshal data is binary, so the file must not be opened in text mode.
      # NOTE(review): Marshal.load executes whatever the file describes; the
      # cache is assumed to be a trusted file written by ::marshal_stems.
      File.open(cache, "rb") do |f|
        @@stems = Marshal.load(f)
      end
      puts("#{@@stems.length} entries totalizing")
    else
      reload_stems_dictionary
    end
  end

  # * Marshal the stems dictionary into a file
  def self.marshal_stems
    File.open(dictionary_path("marshal_stems"), "wb") do |f|
      Marshal.dump(@@stems, f)
    end
  end

  # * Loads the stems dictionary from the original file, then marshals it for
  #   faster access next time
  def self.reload_stems_dictionary
    # Reload into a fresh hash so repeated reloads do not accumulate entries.
    @@stems = {}
    load_dictionary(@@stems, "dictStems", dictionary_path("dictStems"))
    marshal_stems
  end

  # * Check if translitered word has prefix
  # * [translitered] Translitered word to be checked
  def has_prefix?(translitered)
    @@prefixes.has_key?(translitered)
  end

  # * Check if translitered word has stem
  # * [translitered] Translitered word to be checked
  def has_stem?(translitered)
    @@stems.has_key?(translitered)
  end

  # * Check if translitered word has suffix
  # * [translitered] Translitered word to be checked
  def has_suffix?(translitered)
    @@suffixes.has_key?(translitered)
  end

  # * Check if prefix and stem are compatible
  # * [prefix] prefix morphology (String)
  # * [stem] stem morphology (String)
  def prefixes_stems_compatible?(prefix, stem)
    @@prefixes_stems_compatibility.member?(prefix + " " + stem)
  end

  # * Check if prefix and suffix are compatible
  # * [prefix] prefix morphology (String)
  # * [suffix] suffix morphology (String)
  def prefixes_suffixes_compatible?(prefix, suffix)
    @@prefixes_suffixes_compatibility.member?(prefix + " " + suffix)
  end

  # * Check if stem and suffix are compatible
  # * [stem] stem morphology (String)
  # * [suffix] suffix morphology (String)
  def stems_suffixes_compatible?(stem, suffix)
    @@stems_suffixes_compatibility.member?(stem + " " + suffix)
  end

  # * Returns the prefixes dictionary
  def prefixes
    @@prefixes
  end

  def prefixes=(prefixes)
    @@prefixes = prefixes
  end

  # * Returns the stems dictionary
  def stems
    @@stems
  end

  def stems=(stems)
    @@stems = stems
  end

  # * Returns the suffixes dictionary
  def suffixes
    @@suffixes
  end

  def suffixes=(suffixes)
    @@suffixes = suffixes
  end

  # * Look a segmented word up in the dictionaries and collect every
  #   prefix/stem/suffix combination that passes all three compatibility checks
  # * [segmented_word] object responding to #prefix, #stem and #suffix
  # * [word_solutions] collection the Solution objects are appended to (<<)
  # * [verbose] passed through to Solution.new
  # * [count] number of solutions found so far
  # * @return the updated solution count
  def analyze_word_in_dictionaries(segmented_word, word_solutions, verbose, count)
    prefix_key = segmented_word.prefix
    stem_key = segmented_word.stem
    suffix_key = segmented_word.suffix
    # All three segments must be known before any combination is tried.
    return count unless has_prefix?(prefix_key) && has_stem?(stem_key) && has_suffix?(suffix_key)

    @@prefixes[prefix_key].each do |prefix|
      @@stems[stem_key].each do |stem|
        next unless prefixes_stems_compatible?(prefix.morphology, stem.morphology)

        @@suffixes[suffix_key].each do |suffix|
          next unless prefixes_suffixes_compatible?(prefix.morphology, suffix.morphology)
          next unless stems_suffixes_compatible?(stem.morphology, suffix.morphology)

          # All tests passed : it is a solution
          count += 1
          word_solutions << Solution.new(verbose, count, prefix, stem, suffix)
        end
      end
    end
    count
  end

  # NOTE: the loaders below followed a bare `private` in the original, which
  # has no effect on `def self.` methods; they remain publicly callable so
  # existing callers keep working.

  # * Load a dictionary from a file
  # * [dictionary] Hash of Arrays the entries are added to
  # * [name] Dictionary name (used in messages)
  # * [file] File path
  #
  # Lines starting with ";; " introduce a lemma, other lines starting with
  # ";" are comments, every remaining line must have 4 tab-separated fields.
  # Raises ArgumentError on a repeated lemma or a malformed entry line; the
  # reported number is the 1-based line number in the file.
  def self.load_dictionary(dictionary, name, file)
    lemmas = Set.new
    forms = 0
    lemma_id = ""
    puts "Loading Dictionary : #{ name }"
    File.readlines(file).each_with_index do |line, index|
      line_number = index + 1
      if line.start_with?(";; ")
        lemma_id = line[3..-1]
        raise ArgumentError, "Lemma #{lemma_id } in #{name} #{line_number} isn't unique" if lemmas.member?(lemma_id)
        lemmas << lemma_id
      elsif !line.start_with?(";")
        fields = line.split("\t", -1)
        raise ArgumentError, "Entry In #{name} line #{line_number} doesn't have 4 fields ( 3 tabs )" unless fields.length == 4
        de = construct_dictionary_entry(fields, name, line_number, lemma_id)
        (dictionary[de.entry] ||= []) << de
        forms += 1
      end
    end

    puts "#{lemmas.size} lemmas and " unless lemma_id == ""
    puts("#{dictionary.length} entries totalizing #{forms} forms")
  end

  # * Load a compatibility table
  # * [set] Set the pairs are added to
  # * [name] Table name (used in messages)
  # * [file] File path
  #
  # Every non-comment line is stripped and its whitespace runs collapsed to a
  # single space, which is the key format the *_compatible? methods build.
  def self.load_compatibility_table(set, name, file)
    puts "Loading compatibility table : #{name} "
    File.readlines(file).each do |line|
      next if line.start_with?(";") # Ignore comments
      set << line.strip.gsub(COMPATIBILITY_SPLITTER, " ")
    end
    puts "#{set.size} entries"
  end

  # * Construct a DictionaryEntry from a split dictionary line
  # * [splited_line] the 4 fields: entry, vocalization, morphology, gloss+POS
  # * [name] Dictionary name (used in error messages)
  # * [line_count] line number (used in error messages)
  # * [lemma_id] lemma the entry belongs to
  # * Raises RuntimeError when no POS can be deduced from the morphology
  def self.construct_dictionary_entry(splited_line, name, line_count, lemma_id)
    entry, vocalization, morphology, gloss_pos = splited_line

    # two ways to get the POS info:
    # (1) explicitly, by extracting it from the gloss field
    # (2) by deduction from the morphology (and sometimes the gloss)
    explicit = POS_TAG_REGEX.match(gloss_pos)
    pos = explicit ? explicit[1] : deduce_pos(morphology, vocalization, gloss_pos, name, line_count)

    # clean up the gloss: remove POS info and extra space, then hand it to
    # Translator (per the original comment, to convert upper-ASCII characters)
    gloss = gloss_pos.sub(/<pos>.+?<\/pos>/, "").strip
    gloss = Translator.new.translate(gloss)
    DictionaryEntry.new(entry, lemma_id, vocalization, morphology, gloss, pos)
  end

  # Path of a file inside the gem's dictionaries directory.
  def self.dictionary_path(name)
    File.dirname(__FILE__) + "/../dictionaries/" + name
  end
  private_class_method :dictionary_path

  # Deduce the POS string from the morphology category.
  # NOTE(review): the result is "<vocalization> <tag>" with a space before the
  # tag, exactly as the original code built it - confirm that is intended.
  def self.deduce_pos(morphology, vocalization, gloss, name, line_count)
    return "" if morphology =~ NO_POS_MORPHOLOGY

    POS_BY_MORPHOLOGY.each do |pattern, tag|
      return "#{vocalization} #{tag}" if morphology =~ pattern
    end

    unless morphology =~ NOUN_MORPHOLOGY
      raise "No POS can be deduced in #{ name} (line #{line_count})"
    end

    # educated guess (99% correct): capitalized gloss => proper noun.
    # Vocalizations ending in "iy~" used to get a separate tag (NOUN_ADJ) but
    # are tagged /NOUN like every other noun, so no extra check is needed.
    tag = (gloss =~ CAPITALIZED_GLOSS) ? "/NOUN_PROP" : "/NOUN"
    "#{vocalization} #{tag}"
  end
  private_class_method :deduce_pos
end
@@ -0,0 +1,78 @@
1
+ # An in-memory handler for managing solutions found by the morphological analyzer.
2
+ #
3
+ # Author:: eSpace technologies www.eSpace.com.eg
4
+ # Copyright:: 2008
5
+
6
+
7
# In-memory store for the results of the morphological analyzer: solutions
# and alternative spellings, both keyed by the translitered word. The data
# lives in class-level hashes, so every instance sees the same content.
class InMemorySolutionsHandler
  # The constructor is left public; ::create hands out one shared instance.
  public_class_method :new

  # The shared instance returned by ::create
  @@handler = nil
  # Hash of solutions for analyzed words
  @@solutions = {}
  # Hash of alternative spellings
  @@alternative_spellings = {}

  # Build the shared handler on first use and return it.
  def self.create
    @@handler ||= new
  end

  # Store the solutions of a word
  # * [translitered] The translitered word.
  # * [sol] The solutions found for it.
  def add_solutions(translitered, sol)
    @@solutions[translitered] = sol
  end

  # Tell whether solutions were already stored for the word
  # * [translitered] The translitered word
  # * @return true when solutions are stored, false otherwise.
  def has_solutions(translitered)
    @@solutions.has_key?(translitered)
  end

  # Fetch the solutions stored for a word
  # * [translitered] The translitered word
  # * @return The stored solutions, or nil when there are none.
  def get_solutions(translitered)
    return nil unless has_solutions(translitered)

    @@solutions[translitered]
  end

  # Store the alternative spellings of a word
  # * [translitered] The translitered word
  # * [alt] The alternative spellings
  def add_alternative_spellings(translitered, alt)
    @@alternative_spellings[translitered] = alt
  end

  # Tell whether alternative spellings were already stored for the word
  # * [translitered] The translitered word
  # * @return true when alternative spellings are stored, false otherwise.
  def has_alternative_spellings(translitered)
    @@alternative_spellings.has_key?(translitered)
  end

  # Fetch the alternative spellings stored for a word
  # * [translitered] The translitered word
  # * @return The stored alternative spellings, or nil when there are none.
  def get_alternative_spellings(translitered)
    return nil unless has_alternative_spellings(translitered)

    @@alternative_spellings[translitered]
  end
end
78
+
@@ -0,0 +1,35 @@
1
+ # Class for Latin-to-Arabic transliteration (Buckwalter scheme)
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
# Converts Buckwalter-romanized text back into Arabic script.
class LatinArabicTranslator
  # * Table used for translation from Latin letters to Arabic (arabization)
  # * According to the Buckwalter system dictionary
  TABLE = {
    "'" => "\u0621", "|" => "\u0622", ">" => "\u0623", "&" => "\u0624",
    "<" => "\u0625", "}" => "\u0626", "A" => "\u0627", "b" => "\u0628",
    "p" => "\u0629", "t" => "\u062A", "v" => "\u062B", "j" => "\u062C",
    "H" => "\u062D", "x" => "\u062E", "d" => "\u062F", "*" => "\u0630",
    "r" => "\u0631", "z" => "\u0632", "s" => "\u0633", "$" => "\u0634", "S" => "\u0635",
    "D" => "\u0636", "T" => "\u0637", "Z" => "\u0638", "E" => "\u0639", "g" => "\u063A",
    "_" => "\u0640", "f" => "\u0641", "q" => "\u0642", "k" => "\u0643", "l" => "\u0644",
    "m" => "\u0645", "n" => "\u0646", "h" => "\u0647", "w" => "\u0648", "Y" => "\u0649", "y" => "\u064A",
    "F" => "\u064B", "N" => "\u064C", "K" => "\u064D", "a" => "\u064E", "u" => "\u064F", "i" => "\u0650",
    "~" => "\u0651", "o" => "\u0652", "`" => "\u0670", "{" => "\u0671", "P" => "\u067E", "J" => "\u0686",
    "V" => "\u06A4", "G" => "\u06AF", "R" => "\u0698", "," => "\u060C", ";" => "\u061B", "?" => "\u061F"
  }.freeze

  # * Translate : transliterate the Roman-lettered word to an Arabic word
  # * [word] Word String to be processed (never modified; may be frozen)
  # * @return transliterated word; characters without a TABLE entry are
  #   copied through unchanged
  def self.translate(word)
    # force_encoding works in place, so apply it to a copy: the caller's
    # string keeps its encoding and frozen strings are accepted.
    text = word.dup.force_encoding("UTF-8")
    result = String.new("")
    text.each_char { |char| result << TABLE.fetch(char, char) }
    result
  end
end
@@ -0,0 +1,20 @@
1
# StringIO is not loaded by default; without this require Logger.new raises
# NameError unless some other file happened to load it first.
require 'stringio'

# Collects messages in an in-memory buffer and emits them all at once, either
# to standard output or to a file.
class Logger
  attr_reader :verbose, :output

  # * [verbose] verbosity flag; stored and exposed, not consulted by #info
  # * [output] path of the file #log writes to, or nil for standard output
  def initialize(verbose = nil, output = nil)
    @verbose = verbose
    @output = output
    @stream = StringIO.new
  end

  # Append a message to the buffer.
  # * [string] the message
  # * [require_verbose] accepted for compatibility; filtering on it is
  #   disabled, so every message is recorded
  def info(string, require_verbose = false)
    @stream.puts(string)
  end

  # Emit everything buffered so far: printed with puts when no output file
  # was given, otherwise written to that file (truncating it).
  def log
    return puts(@stream.string) if @output.nil?

    File.open(@output, "w") { |f| f.puts @stream.string }
  end
end