raramorph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ # Class for Arabic-to-Latin transliteration (Buckwalter scheme)
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
+
6
# Transliterates Arabic script into Latin letters using the Buckwalter
# table below, then removes the vowel/diacritic marks from the result.
class ArabicLatinTranslator

  # * Table used for translation from Arabic to Latin letters ( romanize word )
  # * According to the Buckwalter system
  # * Tatweel (U+0640) maps to the empty string, i.e. it is dropped. The
  #   original literal listed this key twice ("_" first, "" last); the last
  #   value always won, so only the effective mapping is kept here.
  TABLE = { "\u0621" => "'", "\u0622" => "|", "\u0623" => ">", "\u0624" => "&", "\u0625" => "<", "\u0626" => "}",
            "\u0627" => "A", "\u0628" => "b", "\u0629" => "p", "\u062A" => "t", "\u062B" => "v", "\u062C" => "j",
            "\u062D" => "H", "\u062E" => "x", "\u062F" => "d", "\u0630" => "*", "\u0631" => "r", "\u0632" => "z",
            "\u0633" => "s", "\u0634" => "$", "\u0635" => "S", "\u0636" => "D", "\u0637" => "T", "\u0638" => "Z",
            "\u0639" => "E", "\u063A" => "g", "\u0640" => "", "\u0641" => "f", "\u0642" => "q", "\u0643" => "k",
            "\u0644" => "l", "\u0645" => "m", "\u0646" => "n", "\u0647" => "h", "\u0648" => "w", "\u0649" => "Y",
            "\u064A" => "y", "\u064B" => "F", "\u064C" => "N", "\u064D" => "K", "\u064E" => "a", "\u064F" => "u",
            "\u0650" => "i", "\u0651" => "~", "\u0652" => "o", "\u0670" => "`", "\u0671" => "{", "\u067E" => "P",
            "\u0686" => "J", "\u06A4" => "V", "\u06AF" => "G", "\u0698" => "R", "\u060C" => ",", "\u061B" => ";",
            "\u061F" => "?" }.freeze

  # Not suitable for morphological analysis : remove all vowels/diacritics, i.e. undo the job !
  VOWEL_REMOVER = Regexp.compile("[FNKaui~o]")
  # Matches the transliterated dagger alif ("`") and alif wasla ("{").
  STRIPER = Regexp.compile("[`\\{]")

  # Misspelled no-op kept only so the class interface stays unchanged;
  # it is not Ruby's constructor hook.
  def initilaize
  end

  # * Translate : transliterate the Arabic word to a Latin lettered word
  # * [word] Word string to be processed (it is not modified)
  # * @return transliterated word with vowel/diacritic marks removed
  #
  # The marks are removed AFTER the table lookup. VOWEL_REMOVER and STRIPER
  # match Latin characters, so applying them to the Arabic input (as was done
  # before) let every Arabic diacritic through as "a", "u", "i", "~", ...
  def self.translate(word)
    # Work on a copy: force_encoding mutates its receiver and raises on
    # frozen strings.
    source = word.dup.force_encoding("UTF-8")
    romanized = source.each_char.each_with_object("".dup) do |char, out|
      out << TABLE.fetch(char, char) # characters outside the table pass through
    end
    romanized.gsub(VOWEL_REMOVER, "").gsub(STRIPER, "")
  end

end
@@ -0,0 +1,40 @@
1
+ # Class For Storing Dictionary Entries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
+
6
# One dictionary record: the entry key, its lemma id, vocalization and
# morphology category, plus the gloss and part-of-speech information, each
# also available split on "+" into an array.
class DictionaryEntry

  attr_reader :entry , :lemma_id , :vocalization , :morphology , :gloss , :glosses , :pos
  @@split_regex = Regexp.compile("\\+")

  # * Initialize a new dictionary entry
  # * [entry] entry key (stored stripped)
  # * [lemma_id] lemma identifier (stored stripped)
  # * [vocalization] vocalized form (stored stripped)
  # * [morphology] morphology category (stored stripped)
  # * [gloss] gloss string; kept as-is in +gloss+, split on "+" into +glosses+
  # * [pos] part-of-speech string; split on "+" into +pos+
  #
  # Ruby always makes +initialize+ private, so the +protected+ marker that
  # used to precede it had no effect and is omitted.
  def initialize( entry, lemma_id, vocalization, morphology, gloss, pos)
    @entry = entry.strip
    @lemma_id = lemma_id.strip
    @vocalization = vocalization.strip
    @morphology = morphology.strip
    @gloss = gloss
    @glosses = fill_instance_array_from_sent_array(gloss.split(@@split_regex))
    @pos = fill_instance_array_from_sent_array(pos.split(@@split_regex))
  end

  private

  # * Build a cleaned copy of the split parts
  # * [sent_array] array of strings produced by splitting on "+"
  # * @return new array with every part stripped and a leading empty part
  #   (the input started with "+") dropped
  #
  # The previous loop assigned the stripped value to its block variable only,
  # so nothing was ever stripped; +map+ actually applies it.
  def fill_instance_array_from_sent_array( sent_array)
    parts = sent_array.map { |value| value.strip }
    parts.shift if parts.first == ""
    parts
  end

end
@@ -0,0 +1,287 @@
1
+ # Class For Storing And Loading Dictionaries
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
+
6
require 'rubygems'
require 'set'
7
# Holds the prefix, stem and suffix dictionaries and the three compatibility
# tables in class-level storage. Instances are only obtainable through
# +create+ (+new+ is private) and all share that storage.
class InMemoryDictionaryHandler

  ##### Dictionaries ########
  #### Dictionaries are HASH OF ARRAYS (entry string => Array of DictionaryEntry) #####
  @@prefixes = {}
  @@stems = {}
  @@suffixes = {}

  # The single handler instance; nil until +create+ has run.
  @@handler = nil

  # Extracts an explicit <pos>...</pos> annotation from the gloss field.
  @@regex = Regexp.compile(".*" + "<pos>(.+?)</pos>" + ".*")

  # Patterns used to deduce a POS tag when the gloss carries none.
  # Indexes 0-5 are matched against the morphology field, 6 against the
  # gloss and 7 against the vocalization.
  @@morphology_regexs = [
    Regexp.compile("^(Pref-0|Suff-0)$"),
    Regexp.compile("^F" + ".*"),
    Regexp.compile("^IV" + ".*"),
    Regexp.compile("^PV" + ".*"),
    Regexp.compile("^CV" + ".*"),
    Regexp.compile("^N" + ".*"),
    Regexp.compile("^[A-Z]" + ".*"),
    Regexp.compile(".*" + "iy~$")
  ]

  # Whitespace run separating the two columns of a compatibility table line.
  @@compatability_stpliter = Regexp.compile("\\s+")

  # POS tags appended to the vocalization by deduction.
  @@vocalization_array = [
    "/FUNC_WORD",
    "/VERB_IMPERFECT",
    "/VERB_PERFECT",
    "/VERB_IMPERATIVE",
    "/NOUN_PROP",
    "/NOUN",
    "/NOUN"
  ]

  private_class_method :new

  # * Loads dictionaries and compatibility tables, then builds the handler
  # * @return the handler instance
  #
  # Repeated calls return the existing handler. Previously every call reset
  # the handler and re-read the files, appending a second copy of every
  # prefix/suffix entry to the shared hashes.
  def self.create
    return @@handler if @@handler

    # Compatibility tables for prefixes-stems, prefixes-suffixes and
    # stems-suffixes combinations; each member is "left right".
    @@prefixes_stems_compatibility = Set.new
    @@prefixes_suffixes_compatibility = Set.new
    @@stems_suffixes_compatibility = Set.new

    puts "Initializing in-memory dictionary handler..."
    # NOTE: process-wide setting, kept from the original implementation.
    Thread.abort_on_exception = true
    load_dictionary( @@prefixes , "dictPrefixes" , dictionary_path("dictPrefixes") )
    load_stems_marshaled_dictionary
    load_dictionary(@@suffixes, "dictSuffixes" , dictionary_path("dictSuffixes"))
    load_compatibility_table(@@prefixes_stems_compatibility , "prefixes_stems_compatibility" , dictionary_path("tableAB"))
    load_compatibility_table(@@prefixes_suffixes_compatibility , "prefixes_suffixes_compatibility" , dictionary_path("tableAC"))
    load_compatibility_table(@@stems_suffixes_compatibility , "stems_suffixes_compatibility" , dictionary_path("tableBC"))
    puts "... Done ... "
    @@handler = new
  end

  # * Load the marshaled stems dictionary if available, otherwise load it
  #   from the original dictionary file
  def self.load_stems_marshaled_dictionary
    marshal_file = dictionary_path("marshal_stems")
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    if File.exist?(marshal_file)
      # NOTE(review): Marshal.load can instantiate arbitrary objects, so this
      # file must only ever come from a trusted source (it is written by
      # marshal_stems below).
      # Binary mode: Marshal data is not text.
      File.open(marshal_file, 'rb') do |f|
        @@stems = Marshal.load(f)
      end
      puts("#{@@stems.length} entries totalizing")
    else
      reload_stems_dictionary
    end
  end

  # * Marshal the stems dictionary into a file
  def self.marshal_stems
    File.open(dictionary_path("marshal_stems"), 'wb') do |f|
      Marshal.dump(@@stems, f)
    end
  end

  # * Loads stem dictionary from the original file, then marshals the
  #   dictionary for faster access next time
  def self.reload_stems_dictionary
    load_dictionary(@@stems, "dictStems", dictionary_path("dictStems"))
    marshal_stems
  end

  # * Check if translitered word has prefix
  # * [translitered] Translitered word to be checked
  def has_prefix?(translitered)
    @@prefixes.has_key?(translitered)
  end

  # * Check if translitered word has stem
  # * [translitered] Translitered word to be checked
  def has_stem?(translitered)
    @@stems.has_key?(translitered)
  end

  # * Check if translitered word has suffix
  # * [translitered] Translitered word to be checked
  def has_suffix?(translitered)
    @@suffixes.has_key?(translitered)
  end

  # * Check if prefix and stem are compatible
  # * [prefix] prefix category to be checked (String)
  # * [stem] stem category to be checked (String)
  def prefixes_stems_compatible?(prefix , stem)
    @@prefixes_stems_compatibility.member?(prefix + " " + stem)
  end

  # * Check if prefix and suffix are compatible
  # * [prefix] prefix category to be checked
  # * [suffix] suffix category to be checked
  def prefixes_suffixes_compatible?(prefix , suffix)
    @@prefixes_suffixes_compatibility.member?(prefix + " " + suffix)
  end

  # * Check if stem and suffix are compatible
  # * [stem] stem category to be checked
  # * [suffix] suffix category to be checked
  def stems_suffixes_compatible?(stem , suffix)
    @@stems_suffixes_compatibility.member?(stem + " " + suffix)
  end

  # * Returns the prefixes dictionary
  def prefixes
    @@prefixes
  end

  def prefixes=(prefixes)
    @@prefixes = prefixes
  end

  # * Returns the stems dictionary
  def stems
    @@stems
  end

  def stems=(stems)
    @@stems = stems
  end

  # * Returns the suffixes dictionary
  def suffixes
    @@suffixes
  end

  def suffixes=(suffixes)
    @@suffixes = suffixes
  end

  # The loaders below are class methods. The bare +private+ that used to
  # precede them never applied to +def self.+ methods, so they have always
  # been publicly callable and stay that way.

  # * Load a dictionary from file
  # * [dictionary] Hash of Arrays that receives the entries
  # * [name] Dictionary name (used in messages)
  # * [file] File path
  #
  # Lines starting with ";; " declare the lemma for the entries that follow;
  # other lines starting with ";" are comments; every remaining line must
  # hold 4 tab-separated fields.
  # Raises ArgumentError on a repeated lemma or a wrong field count. The
  # number in the message is the 1-based line of the file (it used to be an
  # index into the comment-filtered list, which pointed at the wrong line).
  def self.load_dictionary( dictionary , name , file )
    lemmas = Set.new
    forms = 0
    lemma_id = ""
    puts "Loading Dictionary : #{ name }"

    File.readlines(file).each_with_index do |line, index|
      line_count = index + 1
      if line.start_with?(";; ")
        lemma_id = line[3..line.length]
        # Raise exception if non-unique lemma id
        raise ArgumentError, "Lemma #{lemma_id } in #{name} #{line_count} isn't unique" if lemmas.member?(lemma_id)
        lemmas << lemma_id
      elsif !line.start_with?(";")
        splited_line = line.split("\t" , -1)
        raise ArgumentError, "Entry In #{name} line #{line_count} doesn't have 4 fields ( 3 tabs )" unless splited_line.length == 4
        de = construct_dictionary_entry(splited_line , name, line_count , lemma_id)
        (dictionary[de.entry] ||= []) << de
        forms += 1
      end
    end

    puts "#{lemmas.size()} lemmas and " unless lemma_id == ""
    puts("#{dictionary.length} entries totalizing #{forms} forms")
  end

  # * Load a compatibility table
  # * [set] Set that receives one "left right" string per non-comment line
  # * [name] Table name (used in messages)
  # * [file] File path
  def self.load_compatibility_table(set, name, file)
    puts "Loading compatibility table : #{name} "
    File.readlines(file).each do |line|
      next if line.start_with?(";") # ignore comments
      set << line.strip.gsub(@@compatability_stpliter, " ")
    end
    puts "#{set.size()} entries"
  end

  # * Construct a dictionary entry from a split line
  # * [splited_line] the 4 fields: entry, vocalization, morphology, gloss(+POS)
  # * [name] Dictionary name (used in error messages)
  # * [line_count] line number (used in error messages)
  # * [lemma_id] lemma the entry belongs to
  # * @return a DictionaryEntry
  #
  # Raises RuntimeError when no POS is given and none can be deduced.
  def self.construct_dictionary_entry(splited_line , name ,line_count , lemma_id)
    entry, vocalization, morphology, gloss_pos = splited_line
    gloss = gloss_pos

    # Two ways to get the POS info:
    # (1) explicitly, by extracting it from the gloss field;
    # (2) by deduction from the morphology (and sometimes the vocalization
    #     and gloss).
    matcher = @@regex.match(gloss_pos)
    pos = matcher ? matcher[1] : deduce_pos(morphology, vocalization, gloss, name, line_count)

    # Clean up the gloss: remove POS info and surrounding space.
    gloss = gloss.sub(/<pos>.+?<\/pos>/,"").strip
    # Translator is defined outside this file; presumably it normalises
    # non-ASCII characters in the gloss -- TODO confirm.
    gloss = Translator.new.translate(gloss)
    DictionaryEntry.new(entry, lemma_id, vocalization, morphology, gloss, pos)
  end

  # Full path of a file in the dictionaries directory next to this file.
  def self.dictionary_path(file_name)
    File.dirname(__FILE__) + "/../dictionaries/" + file_name
  end

  # Deduces the POS string for an entry whose gloss has no <pos> annotation:
  # "" for Pref-0/Suff-0, otherwise "<vocalization> <tag>".
  def self.deduce_pos(morphology, vocalization, gloss, name, line_count)
    return "" if morphology.match(@@morphology_regexs[0])

    tag =
      if morphology.match(@@morphology_regexs[1])
        @@vocalization_array[0]
      elsif morphology.match(@@morphology_regexs[2])
        @@vocalization_array[1]
      elsif morphology.match(@@morphology_regexs[3])
        @@vocalization_array[2]
      elsif morphology.match(@@morphology_regexs[4])
        @@vocalization_array[3]
      elsif morphology.match(@@morphology_regexs[5])
        # educated guess (99% correct): a gloss starting with a capital
        # letter is taken to be a proper noun
        if gloss.match(@@morphology_regexs[6])
          @@vocalization_array[4]
        # (was NOUN_ADJ: some of these are really ADJ's and need to be tagged manually)
        elsif vocalization.match(@@morphology_regexs[7])
          @@vocalization_array[5]
        else
          @@vocalization_array[6]
        end
      else
        raise "No POS can be deduced in #{ name} (line #{line_count}"
      end

    "#{vocalization} #{tag}"
  end

  private_class_method :dictionary_path, :deduce_pos
end
@@ -0,0 +1,78 @@
1
+ # An in-memory handler for managing solutions found by the morphological analyzer.
2
+ #
3
+ # Author:: eSpace technologies www.eSpace.com.eg
4
+ # Copyright:: 2008
5
+ #
6
+
7
# Caches analysis results per translitered word: the solutions found for it
# and its alternative spellings. Both caches live in class variables, so
# every instance reads and writes the same data.
class InMemorySolutionsHandler

  # Hash of solutions for analyzed words
  @@solutions = {}
  # Hash of alternative spellings
  @@alternative_spellings = {}
  # Instance handed out by +create+
  @@handler = nil

  public_class_method :new

  # Returns the shared handler, building it on first use.
  def self.create
    @@handler ||= new
  end

  # Store the solutions for a given word
  # * [translitered] The translitered word.
  # * [sol] The solutions for the translitered word.
  def add_solutions(translitered, sol)
    @@solutions[translitered] = sol
  end

  # Tell whether solutions were stored for the word
  # * [translitered] The translitered word
  # * @return true when solutions are stored for it, false otherwise.
  def has_solutions(translitered)
    @@solutions.has_key?(translitered)
  end

  # Look up the solutions of a given word
  # * [translitered] The translitered word
  # * @return The stored solutions, or nil when none were stored.
  def get_solutions(translitered)
    has_solutions(translitered) ? @@solutions[translitered] : nil
  end

  # Store the alternative spellings for a given word
  # * [translitered] The translitered word
  # * [alt] The alternative spellings
  def add_alternative_spellings(translitered, alt)
    @@alternative_spellings[translitered] = alt
  end

  # Tell whether alternative spellings were stored for the word
  # * [translitered] The translitered word
  # * @return true when alternative spellings are stored for it.
  def has_alternative_spellings(translitered)
    @@alternative_spellings.has_key?(translitered)
  end

  # Look up the alternative spellings of a given word
  # * [translitered] The translitered word
  # * @return The stored alternative spellings, or nil when none were stored.
  def get_alternative_spellings(translitered)
    has_alternative_spellings(translitered) ? @@alternative_spellings[translitered] : nil
  end

end
78
+
@@ -0,0 +1,35 @@
1
+ # Class for Latin-to-Arabic transliteration (Buckwalter scheme)
2
+ # Author:: eSpace technologies www.eSpace.com.eg
3
+ # Copyright:: 2008
4
+ #
5
+
6
# Transliterates Latin (Buckwalter) letters into Arabic script.
class LatinArabicTranslator

  # * Table used for translation from Latin letters to Arabic ( arabize word )
  # * According to the Buckwalter system
  TABLE = { "'" => "\u0621", "|" => "\u0622", ">" => "\u0623", "&" => "\u0624",
            "<" => "\u0625", "}" => "\u0626", "A" => "\u0627", "b" => "\u0628",
            "p" => "\u0629", "t" => "\u062A", "v" => "\u062B", "j" => "\u062C",
            "H" => "\u062D", "x" => "\u062E", "d" => "\u062F", "*" => "\u0630",
            "r" => "\u0631", "z" => "\u0632", "s" => "\u0633", "$" => "\u0634", "S" => "\u0635",
            "D" => "\u0636", "T" => "\u0637", "Z" => "\u0638", "E" => "\u0639", "g" => "\u063A",
            "_" => "\u0640", "f" => "\u0641", "q" => "\u0642", "k" => "\u0643", "l" => "\u0644",
            "m" => "\u0645", "n" => "\u0646", "h" => "\u0647", "w" => "\u0648", "Y" => "\u0649", "y" => "\u064A",
            "F" => "\u064B", "N" => "\u064C", "K" => "\u064D", "a" => "\u064E", "u" => "\u064F", "i" => "\u0650",
            "~" => "\u0651", "o" => "\u0652", "`" => "\u0670", "{" => "\u0671", "P" => "\u067E", "J" => "\u0686",
            "V" => "\u06A4", "G" => "\u06AF", "R" => "\u0698", "," => "\u060C", ";" => "\u061B", "?" => "\u061F"
          }.freeze

  # * Translate : transliterate the Latin lettered word to an Arabic word
  # * [word] Word string to be processed (it is not modified)
  # * @return transliterated word; characters outside the table are kept as-is
  #
  def self.translate(word)
    # Work on a copy: force_encoding mutates its receiver and raises on
    # frozen strings.
    source = word.dup.force_encoding("UTF-8")
    # Append into one buffer instead of allocating a new string per character.
    source.each_char.each_with_object("".dup) do |char, out|
      out << TABLE.fetch(char, char)
    end
  end
end