RubyGems - mosta-raramorph - Versions diffs - 0.1.0 - Mend

mosta-raramorph 0.1.0

Files changed (23) hide show

data/README +56 -0
data/bin/raramorph +6 -0
data/lib/dictionaries/dictPrefixes +421 -0
data/lib/dictionaries/dictStems +135989 -0
data/lib/dictionaries/dictSuffixes +1170 -0
data/lib/dictionaries/marshal_stems +0 -0
data/lib/dictionaries/tableAB +2276 -0
data/lib/dictionaries/tableAC +743 -0
data/lib/dictionaries/tableBC +1584 -0
data/lib/raramorph/arabic_latin_translator.rb +38 -0
data/lib/raramorph/dictionary_entry.rb +40 -0
data/lib/raramorph/in_memory_dictionary_handler.rb +325 -0
data/lib/raramorph/in_memory_solutions_handler.rb +78 -0
data/lib/raramorph/latin_arabic_translator.rb +35 -0
data/lib/raramorph/logger.rb +20 -0
data/lib/raramorph/raramorph.rb +417 -0
data/lib/raramorph/solution.rb +592 -0
data/lib/raramorph/translator.rb +40 -0
data/lib/raramorph.rb +16 -0
data/lib/raramorph_main.rb +34 -0
data/lib/test_input/UTF-8.txt +32 -0
data/raramorph.gemspec +42 -0
metadata +75 -0

data/lib/raramorph/arabic_latin_translator.rb ADDED Viewed

@@ -0,0 +1,38 @@
# Arabic -> Latin transliteration (romanization) using the Buckwalter scheme.
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
#
class ArabicLatinTranslator
  # Maps each supported Arabic character to its Buckwalter romanization.
  #
  # TATWEEL (U+0640) maps to the empty string. The table used to list it twice
  # ("_" first, then ""); Ruby keeps the last value of a duplicated key, so the
  # effective mapping has always been "" and only that entry is kept here.
  TABLE = {
    "\u0621" => "'", "\u0622" => "|", "\u0623" => ">", "\u0624" => "&", "\u0625" => "<", "\u0626" => "}",
    "\u0627" => "A", "\u0628" => "b", "\u0629" => "p", "\u062A" => "t", "\u062B" => "v", "\u062C" => "j",
    "\u062D" => "H", "\u062E" => "x", "\u062F" => "d", "\u0630" => "*", "\u0631" => "r", "\u0632" => "z",
    "\u0633" => "s", "\u0634" => "$", "\u0635" => "S", "\u0636" => "D", "\u0637" => "T", "\u0638" => "Z",
    "\u0639" => "E", "\u063A" => "g", "\u0640" => "", "\u0641" => "f", "\u0642" => "q", "\u0643" => "k",
    "\u0644" => "l", "\u0645" => "m", "\u0646" => "n", "\u0647" => "h", "\u0648" => "w", "\u0649" => "Y",
    "\u064A" => "y", "\u064B" => "F", "\u064C" => "N", "\u064D" => "K", "\u064E" => "a", "\u064F" => "u",
    "\u0650" => "i", "\u0651" => "~", "\u0652" => "o", "\u0670" => "`", "\u0671" => "{", "\u067E" => "P",
    "\u0686" => "J", "\u06A4" => "V", "\u06AF" => "G", "\u0698" => "R", "\u060C" => ",", "\u061B" => ";",
    "\u061F" => "?"
  }.freeze

  # Buckwalter symbols for the short vowels / diacritics. They are removed
  # because the morphological analysis works on unvocalized words.
  VOWEL_REMOVER = Regexp.compile("[FNKaui~o]")
  # Buckwalter symbols for superscript alef and alef wasla, also removed.
  STRIPER = Regexp.compile("[`\\{]")

  # * Translate : Transliterate the Arabic word to a Roman lettered word
  # * [word] Word String to be processed; it is left untouched (a copy is
  #   re-tagged as UTF-8), so frozen strings are accepted
  # * @return the unvocalized transliterated word; characters that are not in
  #   TABLE are passed through unchanged
  #
  def self.translate(word)
    source = word.dup.force_encoding("UTF-8")
    romanized = source.each_char.each_with_object("".dup) do |char, out|
      out << TABLE.fetch(char, char)
    end
    # The removers target Buckwalter symbols, so they have to run AFTER the
    # transliteration; running them on the Arabic input leaves every
    # diacritic in place.
    romanized.gsub(VOWEL_REMOVER, "").gsub(STRIPER, "")
  end
end

data/lib/raramorph/dictionary_entry.rb ADDED Viewed

@@ -0,0 +1,40 @@
# Stores one dictionary entry (a prefix, stem or suffix form).
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class DictionaryEntry
  # Gloss and POS fields hold several components separated by "+".
  SPLIT_REGEX = Regexp.compile("\\+")

  attr_reader :entry, :lemma_id, :vocalization, :morphology, :gloss, :glosses, :pos

  # * Initialize a new dictionary entry
  # * [entry] the form as it is looked up; surrounding whitespace is removed
  # * [lemma_id] identifier of the lemma; surrounding whitespace is removed
  # * [vocalization] vocalized form; surrounding whitespace is removed
  # * [morphology] morphological category; surrounding whitespace is removed
  # * [gloss] gloss string, stored as given in +gloss+ and split on "+"
  #   into +glosses+
  # * [pos] part-of-speech string, split on "+" into the +pos+ array
  def initialize(entry, lemma_id, vocalization, morphology, gloss, pos)
    @entry = entry.strip
    @lemma_id = lemma_id.strip
    @vocalization = vocalization.strip
    @morphology = morphology.strip
    @gloss = gloss
    @glosses = fill_instance_array_from_sent_array(gloss.split(SPLIT_REGEX))
    @pos = fill_instance_array_from_sent_array(pos.split(SPLIT_REGEX))
  end

  private

  # * Normalizes the components produced by splitting a field on "+"
  # * [sent_array] the raw components
  # * @return a new array with every component stripped and a leading empty
  #   component (field starting with "+") dropped
  #
  # The previous implementation only reassigned the block variable
  # (`value = value.strip`), so nothing was ever stripped.
  def fill_instance_array_from_sent_array(sent_array)
    components = sent_array.map(&:strip)
    components.shift if components.first == ""
    components
  end
end

data/lib/raramorph/in_memory_dictionary_handler.rb ADDED Viewed

@@ -0,0 +1,325 @@
# Loads and stores the prefix, stem and suffix dictionaries together with the
# three compatibility tables, and answers lookups against them.
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
require 'rubygems'
require 'set'

class InMemoryDictionaryHandler
  # Singleton class: instances are only handed out by .create
  private_class_method :new

  # Extracts an explicit POS annotation from the gloss field of a line.
  POS_REGEX = Regexp.compile(".*<pos>(.+?)</pos>.*")
  # Morphologies that carry no POS at all.
  NO_POS_MORPHOLOGY = Regexp.compile("^(Pref-0|Suff-0)$")
  # Morphology pattern => POS tag, tried in this order.
  MORPHOLOGY_POS_TAGS = [
    [Regexp.compile("^F.*"),  "/FUNC_WORD"],
    [Regexp.compile("^IV.*"), "/VERB_IMPERFECT"],
    [Regexp.compile("^PV.*"), "/VERB_PERFECT"],
    [Regexp.compile("^CV.*"), "/VERB_IMPERATIVE"]
  ].freeze
  NOUN_MORPHOLOGY = Regexp.compile("^N.*")
  # A gloss starting with a capital letter is taken to be a proper noun.
  PROPER_NOUN_GLOSS = Regexp.compile("^[A-Z].*")
  # Runs of whitespace in compatibility tables are collapsed to one space.
  COMPATIBILITY_SPLITTER = Regexp.compile("\\s+")

  ##### Dictionaries : HASH OF ARRAYS (entry => [DictionaryEntry, ...]) #####
  @@prefixes = {}
  @@stems = {}
  @@suffixes = {}
  ##### Compatibility tables : sets of "morphologyA morphologyB" strings #####
  @@prefixes_stems_compatibility = Set.new
  @@prefixes_suffixes_compatibility = Set.new
  @@stems_suffixes_compatibility = Set.new
  @@handler = nil

  # * Loads dictionaries and compatibility tables, then returns the handler.
  # * @return the unique InMemoryDictionaryHandler instance
  #
  # The data is loaded on the first call only. Loading it again on every call
  # appended a second copy of each prefix and suffix entry to the shared
  # hashes, which duplicated the solutions found afterwards.
  def self.create
    return @@handler if @@handler

    # Start from empty containers so a retry after a failed load cannot
    # accumulate duplicates either.
    @@prefixes = {}
    @@stems = {}
    @@suffixes = {}
    @@prefixes_stems_compatibility = Set.new
    @@prefixes_suffixes_compatibility = Set.new
    @@stems_suffixes_compatibility = Set.new

    puts "Initializing in-memory dictionary handler..."
    Thread.abort_on_exception = true
    load_dictionary(@@prefixes, "dictPrefixes", dictionary_path("dictPrefixes"))
    load_stems_marshaled_dictionary
    load_dictionary(@@suffixes, "dictSuffixes", dictionary_path("dictSuffixes"))
    load_compatibility_table(@@prefixes_stems_compatibility, "prefixes_stems_compatibility", dictionary_path("tableAB"))
    load_compatibility_table(@@prefixes_suffixes_compatibility, "prefixes_suffixes_compatibility", dictionary_path("tableAC"))
    load_compatibility_table(@@stems_suffixes_compatibility, "stems_suffixes_compatibility", dictionary_path("tableBC"))
    puts "... Done ... "
    @@handler = new
  end

  # * Loads the marshaled stems dictionary if available, otherwise loads the
  #   original dictionary (which also writes the marshaled copy)
  def self.load_stems_marshaled_dictionary
    cache = dictionary_path("marshal_stems")
    if File.exist?(cache) # File.exists? no longer exists in Ruby 3.2+
      # NOTE(security): Marshal.load can instantiate arbitrary objects; this
      # file must only ever be the cache written by .marshal_stems.
      # Binary mode: Marshal data is not text.
      File.open(cache, "rb") { |f| @@stems = Marshal.load(f) }
      puts("#{@@stems.length}  entries totalizing")
    else
      reload_stems_dictionary
    end
  end

  # * Marshals the stems dictionary into a file
  def self.marshal_stems
    File.open(dictionary_path("marshal_stems"), "wb") do |f|
      Marshal.dump(@@stems, f)
    end
  end

  # * Loads the stems dictionary from the original file, then marshals it for
  #   faster access next time
  def self.reload_stems_dictionary
    @@stems = {} # a reload replaces the stems instead of appending to them
    load_dictionary(@@stems, "dictStems", dictionary_path("dictStems"))
    marshal_stems
  end

  # * Check if translitered word has prefix
  # * [translitered] Translitered word to be checked
  def has_prefix?(translitered)
    @@prefixes.has_key?(translitered)
  end

  # * Check if translitered word has stem
  # * [translitered] Translitered word to be checked
  def has_stem?(translitered)
    @@stems.has_key?(translitered)
  end

  # * Check if translitered word has suffix
  # * [translitered] Translitered word to be checked
  def has_suffix?(translitered)
    @@suffixes.has_key?(translitered)
  end

  # * Check if prefix and stem are compatible
  # * [prefix] morphology of the prefix (String)
  # * [stem] morphology of the stem (String)
  def prefixes_stems_compatible?(prefix, stem)
    @@prefixes_stems_compatibility.member?(prefix + " " + stem)
  end

  # * Check if prefix and suffix are compatible
  # * [prefix] morphology of the prefix (String)
  # * [suffix] morphology of the suffix (String)
  def prefixes_suffixes_compatible?(prefix, suffix)
    @@prefixes_suffixes_compatibility.member?(prefix + " " + suffix)
  end

  # * Check if stem and suffix are compatible
  # * [stem] morphology of the stem (String)
  # * [suffix] morphology of the suffix (String)
  def stems_suffixes_compatible?(stem, suffix)
    @@stems_suffixes_compatibility.member?(stem + " " + suffix)
  end

  # * Returns the prefixes dictionary
  def prefixes
    @@prefixes
  end

  # * Replaces the prefixes dictionary (shared by the whole class)
  def prefixes=(prefixes)
    @@prefixes = prefixes
  end

  # * Returns the stems dictionary
  def stems
    @@stems
  end

  # * Replaces the stems dictionary (shared by the whole class)
  def stems=(stems)
    @@stems = stems
  end

  # * Returns the suffixes dictionary
  def suffixes
    @@suffixes
  end

  # * Replaces the suffixes dictionary (shared by the whole class)
  def suffixes=(suffixes)
    @@suffixes = suffixes
  end

  # * Collects every prefix/stem/suffix combination of the segmented word
  #   that passes the three compatibility checks
  # * [segmented_word] object responding to +prefix+, +stem+ and +suffix+
  # * [word_solutions] collection the Solution objects are appended to
  # * [verbose] passed through to Solution.new
  # * [count] number of solutions found so far
  # * @return the updated number of solutions
  def analyze_word_in_dictionaries(segmented_word, word_solutions, verbose, count)
    # All three parts have to be known before anything is combined.
    return count unless has_prefix?(segmented_word.prefix) &&
                        has_stem?(segmented_word.stem) &&
                        has_suffix?(segmented_word.suffix)

    suffix_entries = @@suffixes[segmented_word.suffix]
    @@prefixes[segmented_word.prefix].each do |prefix|
      @@stems[segmented_word.stem].each do |stem|
        next unless prefixes_stems_compatible?(prefix.morphology, stem.morphology)

        suffix_entries.each do |suffix|
          next unless prefixes_suffixes_compatible?(prefix.morphology, suffix.morphology) &&
                      stems_suffixes_compatible?(stem.morphology, suffix.morphology)

          # All tests passed : it is a solution
          count += 1
          word_solutions << Solution.new(verbose, count, prefix, stem, suffix)
        end
      end
    end
    count
  end

  # The loaders below used to sit under a `private` keyword, which has no
  # effect on `def self.` methods; they have always been callable from
  # outside and are kept that way.

  # * Loads a dictionary from a file
  # * [dictionary] Hash of Arrays the entries are added to
  # * [name] Dictionary name (used in messages)
  # * [file] File path
  # * Raises ArgumentError on a duplicated lemma or on an entry line that does
  #   not have exactly 4 tab-separated fields
  def self.load_dictionary(dictionary, name, file)
    lemmas = Set.new
    forms = 0
    lemma_id = ""
    puts "Loading Dictionary : #{ name }"
    # The index is the real 1-based line number in the file, so error
    # messages point at the offending line even when comments precede it.
    IO.readlines(file).each_with_index do |line, index|
      line_count = index + 1
      if line.start_with?(";; ")
        # ";; " introduces the lemma shared by the entries that follow
        lemma_id = line[3..-1]
        raise ArgumentError, "Lemma #{lemma_id } in #{name} #{line_count}  isn't unique" if lemmas.member?(lemma_id)
        lemmas << lemma_id
      elsif !line.start_with?(";") # any other ";" line is a comment
        splited_line = line.split("\t", -1)
        raise ArgumentError, "Entry In #{name} line #{line_count} doesn't have 4 fields ( 3 tabs )" unless splited_line.length == 4
        de = construct_dictionary_entry(splited_line, name, line_count, lemma_id)
        (dictionary[de.entry] ||= []) << de
        forms += 1
      end
    end
    puts "#{lemmas.size()}  lemmas and " unless lemma_id == ""
    puts("#{dictionary.length}  entries totalizing  #{forms}  forms")
  end

  # * Loads a compatibility table
  # * [set] Set the "A B" pairs are added to
  # * [name] Table name (used in messages)
  # * [file] File path
  def self.load_compatibility_table(set, name, file)
    puts "Loading compatibility table : #{name}  "
    IO.readlines(file).each do |line|
      next if line.start_with?(";") # Ignore comments
      set << line.strip.gsub(COMPATIBILITY_SPLITTER, " ")
    end
    puts "#{set.size()} entries"
  end

  # * Constructs a DictionaryEntry from a split dictionary line
  # * [splited_line] the 4 fields : entry, vocalization, morphology, gloss/POS
  # * [name] Dictionary name (used in messages)
  # * [line_count] line number (used in messages)
  # * [lemma_id] lemma the entry belongs to
  def self.construct_dictionary_entry(splited_line, name, line_count, lemma_id)
    entry, vocalization, morphology, gloss_pos = splited_line
    # two ways to get the POS info
    # (1) explicitly, by extracting it from the gloss field
    # (2) by deduction from the morphology (and sometimes the gloss)
    matcher = POS_REGEX.match(gloss_pos)
    pos = matcher ? matcher[1] : deduce_pos(morphology, vocalization, gloss_pos, name, line_count)
    # clean up the gloss: remove POS info and extra space, then let
    # Translator#translate post-process it
    gloss = gloss_pos.sub(/<pos>.+?<\/pos>/, "").strip
    gloss = Translator.new.translate(gloss)
    DictionaryEntry.new(entry, lemma_id, vocalization, morphology, gloss, pos)
  end

  # * Deduces the POS of an entry that has no explicit <pos> annotation
  # * @return "" for Pref-0 / Suff-0, otherwise "<vocalization> <tag>"
  # * Raises RuntimeError when the morphology matches no known category
  #
  # NOTE(review): the deduced POS has a space between the vocalization and
  # the "/TAG" part; kept byte-for-byte because the code consuming it is not
  # visible here - confirm the space is intended.
  def self.deduce_pos(morphology, vocalization, gloss, name, line_count)
    return "" if morphology =~ NO_POS_MORPHOLOGY

    MORPHOLOGY_POS_TAGS.each do |pattern, tag|
      return "#{vocalization} #{tag}" if morphology =~ pattern
    end
    raise "No POS can be deduced in #{ name}  (line  #{line_count})" unless morphology =~ NOUN_MORPHOLOGY

    # educated guess (99% correct) : a capitalized gloss is a proper noun.
    # Every other noun is tagged NOUN (was NOUN_ADJ for vocalizations ending
    # in "iy~": some of these are really ADJ's and need to be tagged manually)
    tag = gloss =~ PROPER_NOUN_GLOSS ? "/NOUN_PROP" : "/NOUN"
    "#{vocalization} #{tag}"
  end

  # * Builds the path of a file in the dictionaries directory
  def self.dictionary_path(file_name)
    File.dirname(__FILE__) + "/../dictionaries/" + file_name
  end

  private_class_method :deduce_pos, :dictionary_path
end

data/lib/raramorph/in_memory_solutions_handler.rb ADDED Viewed

@@ -0,0 +1,78 @@
# An in-memory handler for managing solutions found by the morphological analyzer.
#
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class InMemorySolutionsHandler
  # The unique instance handed out by .create (singleton pattern).
  # The constructor stays public, as it always was: the data lives in class
  # variables, so every instance sees the same solutions and spellings.
  @@handler = nil
  # Hash of solutions for analyzed words
  @@solutions = {}
  # Hash of alternative spellings
  @@alternative_spellings = {}

  # Returns the shared handler, creating it on first use
  #  * @return The unique InMemorySolutionsHandler instance.
  def self.create
    @@handler ||= new
  end

  # Add solutions for a given word
  # * [translitered] The translitered word.
  # * [sol] The solution to the translitered word.
  def add_solutions(translitered, sol)
    @@solutions[translitered] = sol
  end

  # Whether or not the word already gave solutions
  # * [translitered] The translitered word
  #  * @return If it has the solution or not (Boolean).
  def has_solutions(translitered)
    @@solutions.has_key?(translitered)
  end
  alias_method :has_solutions?, :has_solutions

  # Return the solutions of a given word
  # * [translitered] The translitered word
  #  * @return The solution matching the translitered word, nil if unknown.
  def get_solutions(translitered)
    # A missing key already yields nil, no separate has_key? check needed.
    @@solutions[translitered]
  end

  # Add alternative spellings for the given word
  # * [translitered] The translitered word
  # * [alt] The alternative spelling
  def add_alternative_spellings(translitered, alt)
    @@alternative_spellings[translitered] = alt
  end

  # Whether or not the word already gave alternative spellings
  # * [translitered] The translitered word
  #  * @return If the translitered word has alternative spellings
  def has_alternative_spellings(translitered)
    @@alternative_spellings.has_key?(translitered)
  end
  alias_method :has_alternative_spellings?, :has_alternative_spellings

  # Return the alternative spellings of the word
  # * [translitered] The translitered word
  #  * @return The alternative spellings matching the translitered word,
  #    nil if unknown.
  def get_alternative_spellings(translitered)
    @@alternative_spellings[translitered]
  end
end

data/lib/raramorph/latin_arabic_translator.rb ADDED Viewed

@@ -0,0 +1,35 @@
# Latin (Buckwalter) -> Arabic transliteration.
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class LatinArabicTranslator
  # Maps each Buckwalter symbol to the Arabic character it stands for
  # ( Arabize Word ).
  TABLE = {
    "'" => "\u0621", "|" => "\u0622", ">" => "\u0623", "&" => "\u0624",
    "<" => "\u0625", "}" => "\u0626", "A" => "\u0627", "b" => "\u0628",
    "p" => "\u0629", "t" => "\u062A", "v" => "\u062B", "j" => "\u062C",
    "H" => "\u062D", "x" => "\u062E", "d" => "\u062F", "*" => "\u0630",
    "r" => "\u0631", "z" => "\u0632", "s" => "\u0633", "$" => "\u0634", "S" => "\u0635",
    "D" => "\u0636", "T" => "\u0637", "Z" => "\u0638", "E" => "\u0639", "g" => "\u063A",
    "_" => "\u0640", "f" => "\u0641", "q" => "\u0642", "k" => "\u0643", "l" => "\u0644",
    "m" => "\u0645", "n" => "\u0646", "h" => "\u0647", "w" => "\u0648", "Y" => "\u0649", "y" => "\u064A",
    "F" => "\u064B", "N" => "\u064C", "K" => "\u064D", "a" => "\u064E", "u" => "\u064F", "i" => "\u0650",
    "~" => "\u0651", "o" => "\u0652", "`" => "\u0670", "{" => "\u0671", "P" => "\u067E", "J" => "\u0686",
    "V" => "\u06A4", "G" => "\u06AF", "R" => "\u0698", "," => "\u060C", ";" => "\u061B", "?" => "\u061F"
  }.freeze

  # * Translate : Transliterate the Roman lettered word to an Arabic word
  # * [word] Word String to be processed; it is left untouched (a copy is
  #   re-tagged as UTF-8), so frozen strings are accepted
  # * @return transliterated word; characters that are not in TABLE are
  #   passed through unchanged
  #
  def self.translate(word)
    # force_encoding on the argument itself would change the caller's string
    # and raise on a frozen one, hence the copy.
    source = word.dup.force_encoding("UTF-8")
    source.each_char.each_with_object("".dup) do |char, arabized|
      arabized << TABLE.fetch(char, char)
    end
  end
end

data/lib/raramorph/logger.rb ADDED Viewed

@@ -0,0 +1,20 @@
require 'stringio' # StringIO is not available unless it is required

# Collects messages in memory and writes them out in one go, either to
# standard output or to a file.
#
# NOTE(review): this class has the same name as the Logger of Ruby's standard
# library; if 'logger' gets required in the same process the two definitions
# are merged - consider namespacing it.
class Logger
  attr_reader :verbose, :output

  # * [verbose] verbosity flag, only stored (see #info)
  # * [output] path of the file #log writes to; nil means standard output
  def initialize(verbose = nil, output = nil)
    @verbose = verbose
    @output = output
    @stream = StringIO.new
  end

  # * Buffers a message
  # * [string] the message
  # * [require_verbose] currently ignored : every message is buffered
  #   whatever the verbosity. The filter that was disabled here read
  #   `if ( require_verbose && @verbose || ! require_verbose )`.
  def info(string, require_verbose = false)
    @stream.puts(string)
  end

  # * Writes everything buffered so far to standard output when no output
  #   file was given, otherwise to that file (overwriting it)
  def log
    return puts(@stream.string) if @output.nil?

    File.open(@output, "w") { |f| f.puts @stream.string }
  end
end