espace-raramorph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # Class For Arabic-to-Latin Transliteration
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
class ArabicLatinTranslator
  # Table used to romanize an Arabic word, one character at a time,
  # according to the Buckwalter system.
  #
  # Tatweel (U+0640) maps to the empty string. The original literal listed this
  # key twice ("_" first, "" last); Ruby keeps the last value, so "" is the
  # behavior that was always in effect and is the only one kept here.
  TABLE = {
    "\u0621" => "'", "\u0622" => "|", "\u0623" => ">", "\u0624" => "&", "\u0625" => "<", "\u0626" => "}",
    "\u0627" => "A", "\u0628" => "b", "\u0629" => "p", "\u062A" => "t", "\u062B" => "v", "\u062C" => "j",
    "\u062D" => "H", "\u062E" => "x", "\u062F" => "d", "\u0630" => "*", "\u0631" => "r", "\u0632" => "z",
    "\u0633" => "s", "\u0634" => "$", "\u0635" => "S", "\u0636" => "D", "\u0637" => "T", "\u0638" => "Z",
    "\u0639" => "E", "\u063A" => "g", "\u0640" => "", "\u0641" => "f", "\u0642" => "q", "\u0643" => "k",
    "\u0644" => "l", "\u0645" => "m", "\u0646" => "n", "\u0647" => "h", "\u0648" => "w", "\u0649" => "Y",
    "\u064A" => "y", "\u064B" => "F", "\u064C" => "N", "\u064D" => "K", "\u064E" => "a", "\u064F" => "u",
    "\u0650" => "i", "\u0651" => "~", "\u0652" => "o", "\u0670" => "`", "\u0671" => "{", "\u067E" => "P",
    "\u0686" => "J", "\u06A4" => "V", "\u06AF" => "G", "\u0698" => "R", "\u060C" => ",", "\u061B" => ";",
    "\u061F" => "?"
  }.freeze

  # Vowel/diacritic symbols that are not suitable for morphological analysis
  # and are therefore removed from the romanized word.
  VOWEL_REMOVER = Regexp.compile("[FNKaui~o]").freeze
  # Additional symbols ("`" and "{") removed from the romanized word.
  STRIPER = Regexp.compile("[`\\{]").freeze

  # * Translate : Transliterate the Arabic word into a Roman-lettered word
  # * [word] Word String to be processed (never modified; a copy is used)
  # * @return the transliterated word with vowel/diacritic symbols removed
  #
  # Characters without a TABLE entry are copied through unchanged.
  def self.translate(word)
    # Work on a copy: the previous implementation edited the caller's string
    # in place (gsub!/force_encoding), which also raised on frozen strings.
    source = word.dup
    source.gsub!(VOWEL_REMOVER, "")
    source.gsub!(STRIPER, "")
    source.force_encoding "UTF-8"

    result = ""
    source.each_char do |char|
      symbol = TABLE[char]
      if symbol.nil?
        result << char
      elsif symbol !~ VOWEL_REMOVER && symbol !~ STRIPER
        # The removers must also apply to the romanized symbols; stripping
        # only the input left Arabic diacritics in the output as a/u/i/~/o...
        result << symbol
      end
    end
    result
  end
end
@@ -0,0 +1,40 @@
1
+ # Class For Storing Dictionary Entries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
class DictionaryEntry
  # Value object holding one dictionary entry.
  #
  # entry, lemma_id, vocalization and morphology are stored stripped of
  # surrounding whitespace; gloss is stored exactly as given. glosses and pos
  # are the "+"-separated pieces of the gloss / pos strings.
  attr_reader :entry, :lemma_id, :vocalization, :morphology, :gloss, :glosses, :pos

  # Separator between the individual glosses / POS tags of an entry.
  SPLIT_REGEX = Regexp.compile("\\+").freeze

  # * Initialize a new dictionary entry
  # * [entry] entry key
  # * [lemma_id] identifier of the lemma the entry belongs to
  # * [vocalization] vocalized form
  # * [morphology] morphological category
  # * [gloss] "+"-separated glosses
  # * [pos] "+"-separated part-of-speech tags
  def initialize(entry, lemma_id, vocalization, morphology, gloss, pos)
    @entry = entry.strip
    @lemma_id = lemma_id.strip
    @vocalization = vocalization.strip
    @morphology = morphology.strip
    @gloss = gloss
    @glosses = fill_instance_array_from_sent_array(gloss.split(SPLIT_REGEX))
    @pos = fill_instance_array_from_sent_array(pos.split(SPLIT_REGEX))
  end

  private

  # Returns a new array with every value stripped, minus the empty first
  # element that a leading "+" produces. The given array is not modified.
  #
  # The previous implementation reassigned the block variable
  # (value = value.strip), which discarded the stripped value, so the pieces
  # kept their surrounding whitespace.
  def fill_instance_array_from_sent_array(sent_array)
    values = sent_array.map { |value| value.strip }
    values.shift if values.first == ""
    values
  end
end
@@ -0,0 +1,325 @@
1
+ # Class For Storing And Loading Dictionaries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
require 'rubygems'
require 'set'
7
class InMemoryDictionaryHandler
  # Singleton class: instances are obtained through InMemoryDictionaryHandler.create.
  #
  # All state lives in class variables, so every instance sees the same
  # dictionaries and compatibility tables.

  # Directory holding the dictionary, compatibility-table and cache files.
  DICTIONARIES_DIR = File.dirname(__FILE__) + "/../dictionaries"

  # Extracts an explicit POS from a gloss field ("...<pos>X</pos>...").
  POS_REGEX = Regexp.compile(".*<pos>(.+?)</pos>.*").freeze

  # Runs of whitespace inside a compatibility-table line.
  COMPATIBILITY_SPLITTER = Regexp.compile("\\s+").freeze

  ##### Dictionaries ########
  #### Dictionaries are HASH OF ARRAYS (entry key => array of entries) #####
  @@prefixes = {} # Dictionary of Prefixes
  @@stems = {}    # Dictionary of Stems
  @@suffixes = {} # Dictionary of Suffixes

  # Compatibility tables; each member is "<morphology A> <morphology B>".
  @@prefixes_stems_compatibility = Set.new
  @@prefixes_suffixes_compatibility = Set.new
  @@stems_suffixes_compatibility = Set.new

  # The unique handler instance, nil until create has completed once.
  @@handler = nil

  private_class_method :new

  # * Loads the dictionaries and compatibility tables, then returns the handler.
  #
  # Loading happens once; later calls return the existing handler. The
  # previous implementation reset the handler and reloaded on every call,
  # appending each prefix and suffix entry to the shared hashes a second time.
  #
  # Side effect kept from the original: sets Thread.abort_on_exception = true.
  def self.create
    return @@handler if @@handler

    # Start from empty containers so that a retry after a failed load does
    # not accumulate entries from the aborted attempt.
    @@prefixes = {}
    @@suffixes = {}
    @@prefixes_stems_compatibility = Set.new
    @@prefixes_suffixes_compatibility = Set.new
    @@stems_suffixes_compatibility = Set.new

    puts "Initializing in-memory dictionary handler..."
    Thread.abort_on_exception = true
    load_dictionary(@@prefixes, "dictPrefixes", DICTIONARIES_DIR + "/dictPrefixes")
    load_stems_marshaled_dictionary
    load_dictionary(@@suffixes, "dictSuffixes", DICTIONARIES_DIR + "/dictSuffixes")
    load_compatibility_table(@@prefixes_stems_compatibility, "prefixes_stems_compatibility", DICTIONARIES_DIR + "/tableAB")
    load_compatibility_table(@@prefixes_suffixes_compatibility, "prefixes_suffixes_compatibility", DICTIONARIES_DIR + "/tableAC")
    load_compatibility_table(@@stems_suffixes_compatibility, "stems_suffixes_compatibility", DICTIONARIES_DIR + "/tableBC")
    puts "... Done ... "
    @@handler = new
  end

  # * Loads the marshaled stems dictionary if available, otherwise loads it
  #   from the original dictionary file (which also writes the cache).
  def self.load_stems_marshaled_dictionary
    cache_path = DICTIONARIES_DIR + '/marshal_stems'
    if File.exist?(cache_path)
      # SECURITY NOTE: Marshal.load can instantiate arbitrary objects. The
      # cache is expected to be the file written by marshal_stems; never
      # point this at a file from an untrusted source.
      # Binary mode: Marshal data is not text.
      File.open(cache_path, 'rb') do |f|
        @@stems = Marshal.load(f)
      end
      puts("#{@@stems.length} entries totalizing")
    else
      reload_stems_dictionary
    end
  end

  # * Marshals the stems dictionary into the cache file (binary mode).
  def self.marshal_stems
    File.open(DICTIONARIES_DIR + '/marshal_stems', 'wb') do |f|
      Marshal.dump(@@stems, f)
    end
  end

  # * Loads the stems dictionary from the original file, then marshals it for
  #   faster access. The stems table is rebuilt from scratch; previously a
  #   reload appended to whatever was already loaded.
  def self.reload_stems_dictionary
    @@stems = {}
    load_dictionary(@@stems, "dictStems", DICTIONARIES_DIR + "/dictStems")
    marshal_stems
  end

  # * Check if the prefixes dictionary has an entry for the word
  # * [translitered] Transliterated word to be checked
  def has_prefix?(translitered)
    @@prefixes.has_key?(translitered)
  end

  # * Check if the stems dictionary has an entry for the word
  # * [translitered] Transliterated word to be checked
  def has_stem?(translitered)
    @@stems.has_key?(translitered)
  end

  # * Check if the suffixes dictionary has an entry for the word
  # * [translitered] Transliterated word to be checked
  def has_suffix?(translitered)
    @@suffixes.has_key?(translitered)
  end

  # * Check if prefix and stem are compatible
  # * [prefix] prefix morphology (String)
  # * [stem] stem morphology (String)
  def prefixes_stems_compatible?(prefix, stem)
    @@prefixes_stems_compatibility.member?(prefix + " " + stem)
  end

  # * Check if prefix and suffix are compatible
  # * [prefix] prefix morphology (String)
  # * [suffix] suffix morphology (String)
  def prefixes_suffixes_compatible?(prefix, suffix)
    @@prefixes_suffixes_compatibility.member?(prefix + " " + suffix)
  end

  # * Check if stem and suffix are compatible
  # * [stem] stem morphology (String)
  # * [suffix] suffix morphology (String)
  def stems_suffixes_compatible?(stem, suffix)
    @@stems_suffixes_compatibility.member?(stem + " " + suffix)
  end

  # * Returns the prefixes dictionary
  def prefixes
    @@prefixes
  end

  # * Replaces the prefixes dictionary (shared by all instances)
  def prefixes=(prefixes)
    @@prefixes = prefixes
  end

  # * Returns the stems dictionary
  def stems
    @@stems
  end

  # * Replaces the stems dictionary (shared by all instances)
  def stems=(stems)
    @@stems = stems
  end

  # * Returns the suffixes dictionary
  def suffixes
    @@suffixes
  end

  # * Replaces the suffixes dictionary (shared by all instances)
  def suffixes=(suffixes)
    @@suffixes = suffixes
  end

  # * Looks a segmented word up in the three dictionaries and appends one
  #   Solution per mutually compatible prefix/stem/suffix entry combination.
  # * [segmented_word] object responding to prefix, stem and suffix
  # * [word_solutions] collection the solutions are appended to (<<)
  # * [verbose] passed through to Solution.new
  # * [count] number of solutions found so far
  # * @return the updated solution count
  def analyze_word_in_dictionaries(segmented_word, word_solutions, verbose, count)
    # All three parts must be known, otherwise there is nothing to combine.
    return count unless has_prefix?(segmented_word.prefix) &&
                        has_stem?(segmented_word.stem) &&
                        has_suffix?(segmented_word.suffix)

    @@prefixes[segmented_word.prefix].each do |prefix|
      @@stems[segmented_word.stem].each do |stem|
        # Prefix/Stem compatibility
        next unless prefixes_stems_compatible?(prefix.morphology, stem.morphology)

        @@suffixes[segmented_word.suffix].each do |suffix|
          # Prefix/Suffix compatibility
          next unless prefixes_suffixes_compatible?(prefix.morphology, suffix.morphology)
          # Stem/Suffix compatibility
          next unless stems_suffixes_compatible?(stem.morphology, suffix.morphology)

          # All tests passed : it is a solution
          count += 1
          word_solutions << Solution.new(verbose, count, prefix, stem, suffix)
        end
      end
    end
    count
  end

  # NOTE: the loaders below were listed under a bare `private`, which does not
  # apply to `def self.` methods; they have always been publicly callable and
  # are left public so existing callers keep working.

  # * Load a dictionary from a file
  # * [dictionary] Hash of Arrays to store the dictionary in
  # * [name] Dictionary name (used in messages)
  # * [file] File path
  #
  # Lines starting with ";; " introduce a lemma, other lines starting with ";"
  # are comments, and every remaining line must hold 4 tab-separated fields.
  # Raises ArgumentError on a repeated lemma or a malformed entry; the line
  # number in the message is the 1-based line of the file (it used to be an
  # index into the comment-filtered lines).
  def self.load_dictionary(dictionary, name, file)
    lemmas = Set.new
    forms = 0
    lemma_id = ""
    puts "Loading Dictionary : #{name}"

    IO.readlines(file).each_with_index do |line, index|
      line_number = index + 1
      if line.start_with?(";; ")
        lemma_id = line[3..-1]
        # Raise exception if non-unique lemma ID
        raise ArgumentError, "Lemma #{lemma_id} in #{name} #{line_number} isn't unique" if lemmas.member?(lemma_id)
        lemmas << lemma_id
      elsif !line.start_with?(";")
        splited_line = line.split("\t", -1)
        unless splited_line.length == 4
          raise ArgumentError, "Entry In #{name} line #{line_number} doesn't have 4 fields ( 3 tabs )"
        end
        de = construct_dictionary_entry(splited_line, name, line_number, lemma_id)
        (dictionary[de.entry] ||= []) << de
        forms += 1
      end
    end

    puts "#{lemmas.size()} lemmas and " unless lemma_id == ""
    puts("#{dictionary.length} entries totalizing #{forms} forms")
  end

  # * Load a compatibility table
  # * [set] Set the table lines are added to
  # * [name] Table name (used in messages)
  # * [file] File path
  #
  # Lines starting with ";" are ignored; every other line is stripped and its
  # whitespace runs are collapsed to a single space before being added.
  def self.load_compatibility_table(set, name, file)
    puts "Loading compatibility table : #{name} "
    IO.readlines(file).each do |line|
      next if line.start_with?(";") # Ignore comments
      set << line.strip.gsub(COMPATIBILITY_SPLITTER, " ")
    end
    puts "#{set.size()} entries"
  end

  # * Construct a DictionaryEntry from the 4 fields of a dictionary line
  # * [splited_line] [entry, vocalization, morphology, gloss(+pos)]
  # * [name] Dictionary name (used in error messages)
  # * [line_count] line number (used in error messages)
  # * [lemma_id] lemma the entry belongs to
  def self.construct_dictionary_entry(splited_line, name, line_count, lemma_id)
    entry, vocalization, morphology, gloss_pos = splited_line

    # Two ways to get the POS info:
    # (1) explicitly, by extracting it from the gloss field;
    # (2) by deduction from the morphology (and sometimes the gloss).
    matcher = POS_REGEX.match(gloss_pos)
    pos = matcher ? matcher[1] : deduce_pos(vocalization, morphology, gloss_pos, name, line_count)

    # Clean up the gloss: remove POS info and surrounding space, then pass it
    # through Translator#translate (defined outside this file).
    gloss = gloss_pos.sub(/<pos>.+?<\/pos>/, "").strip
    gloss = Translator.new.translate(gloss)
    DictionaryEntry.new(entry, lemma_id, vocalization, morphology, gloss, pos)
  end

  # Deduces the POS string from the morphology category when the gloss field
  # carries no explicit <pos> tag. Raises RuntimeError when no rule applies.
  #
  # NOTE(review): the space between the vocalization and the "/TAG" is
  # reproduced exactly as the original emitted it; confirm that consumers
  # expect "voc /TAG" rather than "voc/TAG".
  def self.deduce_pos(vocalization, morphology, gloss, name, line_count)
    case morphology
    when /^(Pref-0|Suff-0)$/ then ""
    when /^F.*/ then "#{vocalization} /FUNC_WORD"
    when /^IV.*/ then "#{vocalization} /VERB_IMPERFECT"
    when /^PV.*/ then "#{vocalization} /VERB_PERFECT"
    when /^CV.*/ then "#{vocalization} /VERB_IMPERATIVE"
    when /^N.*/
      # Educated guess (99% correct): a gloss starting with a capital letter
      # is tagged as a proper noun. Every other noun is tagged NOUN; the
      # original had a separate branch for vocalizations ending in "iy~"
      # (was NOUN_ADJ) that produced the same "/NOUN" tag.
      gloss =~ /^[A-Z].*/ ? "#{vocalization} /NOUN_PROP" : "#{vocalization} /NOUN"
    else
      raise "No POS can be deduced in #{name} (line #{line_count})"
    end
  end
  private_class_method :deduce_pos
end
@@ -0,0 +1,78 @@
1
+ # An in-memory handler for managing solutions found by the morphological analyzer.
2
+ #
3
+ # Author:: eSpace technologies www.eSpace.com.eg
4
+ # Copyright:: 2008
5
+
6
+
7
class InMemorySolutionsHandler
  # In-memory store of the solutions and alternative spellings found for
  # analyzed words.
  #
  # The storage is held in class variables, so it is shared by every
  # instance. InMemorySolutionsHandler.create returns one shared instance;
  # `new` is also left public, as in the original.
  public_class_method :new

  # The instance handed out by create (nil until first use)
  @@handler = nil
  # Hash of solutions for analyzed words
  @@solutions = {}
  # Hash of alternative spellings
  @@alternative_spellings = {}

  # Returns the shared handler, creating it on first call.
  def self.create
    @@handler = new unless @@handler
    @@handler
  end

  # Add solutions for a given word
  # * [translitered] The transliterated word.
  # * [sol] The solution to the transliterated word.
  def add_solutions(translitered, sol)
    @@solutions[translitered] = sol
  end

  # Whether or not solutions are stored for the word
  # * [translitered] The transliterated word
  # * @return true when a solution is stored, false otherwise.
  def has_solutions(translitered)
    @@solutions.has_key?(translitered)
  end
  alias has_solutions? has_solutions

  # Return the solutions of a given word
  # * [translitered] The transliterated word
  # * @return The stored solution, or nil when none is stored.
  def get_solutions(translitered)
    @@solutions[translitered]
  end

  # Add alternative spellings for the given word
  # * [translitered] The transliterated word
  # * [alt] The alternative spelling
  def add_alternative_spellings(translitered, alt)
    @@alternative_spellings[translitered] = alt
  end

  # Whether or not alternative spellings are stored for the word
  # * [translitered] The transliterated word
  # * @return true when alternative spellings are stored, false otherwise.
  def has_alternative_spellings(translitered)
    @@alternative_spellings.has_key?(translitered)
  end
  alias has_alternative_spellings? has_alternative_spellings

  # Return the alternative spellings of the word
  # * [translitered] The transliterated word
  # * @return The stored alternative spellings, or nil when none are stored.
  def get_alternative_spellings(translitered)
    @@alternative_spellings[translitered]
  end
end
78
+
@@ -0,0 +1,35 @@
1
+ # Class For Latin-to-Arabic Transliteration
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+
5
+
6
class LatinArabicTranslator
  # * Table used to turn Latin (Buckwalter) symbols into Arabic characters,
  #   i.e. to "arabize" a word
  # * According to the Buckwalter system
  TABLE = {
    "'" => "\u0621", "|" => "\u0622", ">" => "\u0623", "&" => "\u0624",
    "<" => "\u0625", "}" => "\u0626", "A" => "\u0627", "b" => "\u0628",
    "p" => "\u0629", "t" => "\u062A", "v" => "\u062B", "j" => "\u062C",
    "H" => "\u062D", "x" => "\u062E", "d" => "\u062F", "*" => "\u0630",
    "r" => "\u0631", "z" => "\u0632", "s" => "\u0633", "$" => "\u0634", "S" => "\u0635",
    "D" => "\u0636", "T" => "\u0637", "Z" => "\u0638", "E" => "\u0639", "g" => "\u063A",
    "_" => "\u0640", "f" => "\u0641", "q" => "\u0642", "k" => "\u0643", "l" => "\u0644",
    "m" => "\u0645", "n" => "\u0646", "h" => "\u0647", "w" => "\u0648", "Y" => "\u0649", "y" => "\u064A",
    "F" => "\u064B", "N" => "\u064C", "K" => "\u064D", "a" => "\u064E", "u" => "\u064F", "i" => "\u0650",
    "~" => "\u0651", "o" => "\u0652", "`" => "\u0670", "{" => "\u0671", "P" => "\u067E", "J" => "\u0686",
    "V" => "\u06A4", "G" => "\u06AF", "R" => "\u0698", "," => "\u060C", ";" => "\u061B", "?" => "\u061F"
  }.freeze

  # * Translate : Transliterate the Roman-lettered word into an Arabic word
  # * [word] Word String to be processed (never modified; a copy is used)
  # * @return the transliterated word
  #
  # Characters without a TABLE entry are copied through unchanged.
  def self.translate(word)
    # force_encoding changes its receiver, so apply it to a copy: the caller's
    # string keeps its encoding tag, and frozen strings are accepted.
    source = word.dup
    source.force_encoding "UTF-8"

    result = ""
    source.each_char do |char|
      # << appends in place instead of building a new string per character
      result << TABLE.fetch(char, char)
    end
    result
  end
end
@@ -0,0 +1,20 @@
1
require 'stringio' # StringIO backs the message buffer; it was used without being loaded

# Buffers messages in memory and emits them all at once, either on standard
# output or into a file.
#
# NOTE(review): this class shares its name with Ruby's standard-library
# Logger; if `logger` is also loaded the two definitions are merged. Confirm
# that this is acceptable for the gem's users.
class Logger
  attr_reader :verbose, :output

  # * [verbose] stored and exposed through #verbose; not consulted by #info
  # * [output] path of the file #log writes to, or nil for standard output
  def initialize(verbose = nil, output = nil)
    @verbose = verbose
    @output = output
    @stream = StringIO.new
  end

  # Appends a message to the buffer.
  # * [string] the message
  # * [require_verbose] accepted for compatibility; currently ignored, every
  #   message is buffered (the verbose filter is disabled in the original)
  def info(string, require_verbose = false)
    @stream.puts(string)
  end

  # Emits the buffered messages: printed with puts when no output path was
  # given, otherwise written to that file (truncating it). The buffer is not
  # cleared. Returns nil.
  def log
    if @output.nil?
      puts @stream.string
    else
      File.open(@output, "w") { |f| f.puts @stream.string }
    end
  end
end
+ end