RubyGems - espace-raramorph - Versions diffs - 0.1.0 - Mend

espace-raramorph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

data/README +56 -0
data/bin/raramorph +6 -0
data/lib/dictionaries/dictPrefixes +421 -0
data/lib/dictionaries/dictStems +135989 -0
data/lib/dictionaries/dictSuffixes +1170 -0
data/lib/dictionaries/marshal_stems +0 -0
data/lib/dictionaries/tableAB +2276 -0
data/lib/dictionaries/tableAC +743 -0
data/lib/dictionaries/tableBC +1584 -0
data/lib/raramorph/arabic_latin_translator.rb +38 -0
data/lib/raramorph/dictionary_entry.rb +40 -0
data/lib/raramorph/in_memory_dictionary_handler.rb +325 -0
data/lib/raramorph/in_memory_solutions_handler.rb +78 -0
data/lib/raramorph/latin_arabic_translator.rb +35 -0
data/lib/raramorph/logger.rb +20 -0
data/lib/raramorph/raramorph.rb +417 -0
data/lib/raramorph/solution.rb +592 -0
data/lib/raramorph/translator.rb +40 -0
data/lib/raramorph.rb +16 -0
data/lib/raramorph_main.rb +34 -0
data/lib/test_input/UTF-8.txt +32 -0
data/raramorph.gemspec +42 -0
metadata +75 -0

data/lib/raramorph/arabic_latin_translator.rb ADDED Viewed

@@ -0,0 +1,38 @@
# Arabic -> Latin transliteration (Buckwalter system).
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
#
class ArabicLatinTranslator
  # * Table used to transliterate Arabic characters into Latin ones
  #   (i.e. romanize a word) according to the Buckwalter system.
  # * ARABIC TATWEEL (U+0640) maps to the empty string: the original table
  #   listed it twice ("_" first, then ""), and the later entry always won.
  TABLE = {
    "\u0621" => "'", "\u0622" => "|", "\u0623" => ">", "\u0624" => "&", "\u0625" => "<", "\u0626" => "}",
    "\u0627" => "A", "\u0628" => "b", "\u0629" => "p", "\u062A" => "t", "\u062B" => "v", "\u062C" => "j",
    "\u062D" => "H", "\u062E" => "x", "\u062F" => "d", "\u0630" => "*", "\u0631" => "r", "\u0632" => "z",
    "\u0633" => "s", "\u0634" => "$", "\u0635" => "S", "\u0636" => "D", "\u0637" => "T", "\u0638" => "Z",
    "\u0639" => "E", "\u063A" => "g", "\u0641" => "f", "\u0642" => "q", "\u0643" => "k", "\u0644" => "l",
    "\u0645" => "m", "\u0646" => "n", "\u0647" => "h", "\u0648" => "w", "\u0649" => "Y", "\u064A" => "y",
    "\u064B" => "F", "\u064C" => "N", "\u064D" => "K", "\u064E" => "a", "\u064F" => "u", "\u0650" => "i",
    "\u0651" => "~", "\u0652" => "o", "\u0670" => "`", "\u0671" => "{", "\u067E" => "P", "\u0686" => "J",
    "\u06A4" => "V", "\u06AF" => "G", "\u0698" => "R", "\u060C" => ",", "\u061B" => ";", "\u061F" => "?",
    "\u0640" => ""
  }.freeze

  # Not suitable for morphological analysis: remove all vowels/diacritics
  # (matches their *romanized* form, so it must run after the table lookup).
  VOWEL_REMOVER = /[FNKaui~o]/
  # Romanized superscript alef and alef wasla, which are dropped as well.
  STRIPER = /[`\{]/

  # * Translate : transliterate the Arabic word into a Roman-lettered word.
  # * [word] Word String to be processed. It is not modified (frozen strings
  #   are accepted); a copy is re-tagged as UTF-8 before being read.
  # * @return transliterated word with vowels/diacritics removed
  #
  def self.translate(word)
    text = word.dup.force_encoding("UTF-8")
    romanized = text.each_char.map { |char| TABLE.fetch(char, char) }.join
    # Strip only after romanizing: the patterns are Latin characters, so
    # applying them to the raw Arabic input left every diacritic in place.
    romanized.gsub(VOWEL_REMOVER, "").gsub(STRIPER, "")
  end
end

data/lib/raramorph/dictionary_entry.rb ADDED Viewed

@@ -0,0 +1,40 @@
# Class for storing dictionary entries
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class DictionaryEntry
  # Read-only fields of the entry. +gloss+ is the raw gloss string as given;
  # +glosses+ and +pos+ are arrays obtained by splitting the gloss and the
  # POS string on "+".
  attr_reader :entry, :lemma_id, :vocalization, :morphology, :gloss, :glosses, :pos

  # Separator between the parts of a gloss / POS string.
  SPLIT_REGEX = /\+/

  # * Initialize a new dictionary entry.
  # * [entry] surface form; stored stripped
  # * [lemma_id] lemma identifier; stored stripped
  # * [vocalization] vocalized form; stored stripped
  # * [morphology] morphological category; stored stripped
  # * [gloss] "+"-separated gloss string (kept verbatim in +gloss+)
  # * [pos] "+"-separated part-of-speech string
  def initialize(entry, lemma_id, vocalization, morphology, gloss, pos)
    @entry = entry.strip
    @lemma_id = lemma_id.strip
    @vocalization = vocalization.strip
    @morphology = morphology.strip
    @gloss = gloss
    @glosses = fill_instance_array_from_sent_array(gloss.split(SPLIT_REGEX))
    @pos = fill_instance_array_from_sent_array(pos.split(SPLIT_REGEX))
  end

  private

  # * Builds the stored array from the split parts.
  # * [sent_array] parts produced by splitting on "+"
  # * @return a new array with every part stripped and a leading empty part
  #   (produced by a string starting with "+") removed
  #
  # The previous implementation assigned the stripped value to the block
  # variable only, so surrounding whitespace was never actually removed.
  def fill_instance_array_from_sent_array(sent_array)
    parts = sent_array.map(&:strip)
    parts.shift if parts.first == ""
    parts
  end
end

data/lib/raramorph/in_memory_dictionary_handler.rb ADDED Viewed

@@ -0,0 +1,325 @@
# Class for storing and loading the dictionaries
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
require 'rubygems'
require 'set'

class InMemoryDictionaryHandler
  # Singleton-style class: instances are obtained through ::create.
  private_class_method :new

  # Directory holding the dictionary and compatibility-table files.
  DICTIONARIES_DIR = File.dirname(__FILE__) + "/../dictionaries"

  # Extracts an explicit <pos>...</pos> annotation from the gloss field.
  POS_REGEX = /.*<pos>(.+?)<\/pos>.*/
  # Morphologies (empty prefix / suffix) for which the POS is left empty.
  NO_POS_MORPHOLOGY = /^(Pref-0|Suff-0)$/
  # Morphology pattern => POS tag appended to the vocalization, tried in order.
  MORPHOLOGY_POS_TAGS = [
    [/^F.*/,  "/FUNC_WORD"],
    [/^IV.*/, "/VERB_IMPERFECT"],
    [/^PV.*/, "/VERB_PERFECT"],
    [/^CV.*/, "/VERB_IMPERATIVE"]
  ].freeze
  # Noun morphologies; tagged NOUN_PROP when the gloss is capitalized.
  NOUN_MORPHOLOGY = /^N.*/
  PROPER_NOUN_GLOSS = /^[A-Z].*/
  # Runs of whitespace in compatibility-table lines are collapsed to one space.
  COMPATIBILITY_SPLITTER = /\s+/

  ##### Dictionaries ########
  #### Dictionaries are HASH OF ARRAYS (entry string => [DictionaryEntry]) #####
  @@prefixes = {}   # Dictionary of prefixes
  @@stems = {}      # Dictionary of stems
  @@suffixes = {}   # Dictionary of suffixes

  # Compatibility tables, stored as sets of "<left> <right>" strings.
  # They exist from class load so the query methods work before ::create.
  @@prefixes_stems_compatibility = Set.new
  @@prefixes_suffixes_compatibility = Set.new
  @@stems_suffixes_compatibility = Set.new

  @@handler = nil

  # * Loads the dictionaries and compatibility tables and builds the handler.
  # * Every call reloads from disk into fresh containers; previously a second
  #   call appended the prefix/suffix entries again, duplicating them.
  # * Side effect kept from the original: sets Thread.abort_on_exception.
  # * @return the newly created handler
  def self.create
    @@handler = nil
    @@prefixes = {}
    @@suffixes = {}
    @@prefixes_stems_compatibility = Set.new
    @@prefixes_suffixes_compatibility = Set.new
    @@stems_suffixes_compatibility = Set.new

    puts "Initializing in-memory dictionary handler..."
    Thread.abort_on_exception = true
    load_dictionary(@@prefixes, "dictPrefixes", dictionary_path("dictPrefixes"))
    load_stems_marshaled_dictionary
    load_dictionary(@@suffixes, "dictSuffixes", dictionary_path("dictSuffixes"))
    load_compatibility_table(@@prefixes_stems_compatibility, "prefixes_stems_compatibility", dictionary_path("tableAB"))
    load_compatibility_table(@@prefixes_suffixes_compatibility, "prefixes_suffixes_compatibility", dictionary_path("tableAC"))
    load_compatibility_table(@@stems_suffixes_compatibility, "stems_suffixes_compatibility", dictionary_path("tableBC"))
    puts "... Done ... "
    @@handler = new
  end

  # * Loads the marshaled stems dictionary if available, otherwise loads it
  #   from the original dictionary file (which also writes the marshal file).
  def self.load_stems_marshaled_dictionary
    path = dictionary_path("marshal_stems")
    return reload_stems_dictionary unless File.exist?(path)

    # NOTE(review): Marshal.load must only ever see the file shipped with /
    # generated by this library; it is unsafe on untrusted data.
    # Binary mode: marshal data is not text.
    @@stems = File.open(path, "rb") { |f| Marshal.load(f) }
    puts("#{@@stems.length}  entries totalizing")
  end

  # * Marshals the stems dictionary into a file.
  def self.marshal_stems
    File.open(dictionary_path("marshal_stems"), "wb") do |f|
      Marshal.dump(@@stems, f)
    end
  end

  # * Loads the stems dictionary from the original file, then marshals it
  #   for faster access next time.
  def self.reload_stems_dictionary
    @@stems = {} # start clean so a reload never duplicates entries
    load_dictionary(@@stems, "dictStems", dictionary_path("dictStems"))
    marshal_stems
  end

  # * Check if translitered word has prefix
  # * [translitered] Translitered word to be checked
  def has_prefix?(translitered)
    @@prefixes.key?(translitered)
  end

  # * Check if translitered word has stem
  # * [translitered] Translitered word to be checked
  def has_stem?(translitered)
    @@stems.key?(translitered)
  end

  # * Check if translitered word has suffix
  # * [translitered] Translitered word to be checked
  def has_suffix?(translitered)
    @@suffixes.key?(translitered)
  end

  # * Check if prefix and stem are compatible
  # * [prefix] prefix morphology (String)
  # * [stem] stem morphology (String)
  def prefixes_stems_compatible?(prefix, stem)
    @@prefixes_stems_compatibility.member?(prefix + " " + stem)
  end

  # * Check if prefix and suffix are compatible
  # * [prefix] prefix morphology (String)
  # * [suffix] suffix morphology (String)
  def prefixes_suffixes_compatible?(prefix, suffix)
    @@prefixes_suffixes_compatibility.member?(prefix + " " + suffix)
  end

  # * Check if stem and suffix are compatible
  # * [stem] stem morphology (String)
  # * [suffix] suffix morphology (String)
  def stems_suffixes_compatible?(stem, suffix)
    @@stems_suffixes_compatibility.member?(stem + " " + suffix)
  end

  # * Returns the prefixes dictionary
  def prefixes
    @@prefixes
  end

  def prefixes=(prefixes)
    @@prefixes = prefixes
  end

  # * Returns the stems dictionary
  def stems
    @@stems
  end

  def stems=(stems)
    @@stems = stems
  end

  # * Returns the suffixes dictionary
  def suffixes
    @@suffixes
  end

  def suffixes=(suffixes)
    @@suffixes = suffixes
  end

  # * Looks the three segments of a word up in the dictionaries and records
  #   every prefix/stem/suffix combination that passes all three
  #   compatibility checks.
  # * [segmented_word] object responding to #prefix, #stem and #suffix
  # * [word_solutions] collection the Solution objects are appended to
  # * [verbose] passed through to Solution.new
  # * [count] number of solutions found so far
  # * @return the updated solution count
  def analyze_word_in_dictionaries(segmented_word, word_solutions, verbose, count)
    # All three segments must be known before any combination is tried.
    return count unless has_prefix?(segmented_word.prefix)
    return count unless has_stem?(segmented_word.stem)
    return count unless has_suffix?(segmented_word.suffix)

    @@prefixes[segmented_word.prefix].each do |prefix|
      @@stems[segmented_word.stem].each do |stem|
        next unless prefixes_stems_compatible?(prefix.morphology, stem.morphology)

        @@suffixes[segmented_word.suffix].each do |suffix|
          next unless prefixes_suffixes_compatible?(prefix.morphology, suffix.morphology)
          next unless stems_suffixes_compatible?(stem.morphology, suffix.morphology)

          # All tests passed: it is a solution.
          count += 1
          word_solutions << Solution.new(verbose, count, prefix, stem, suffix)
        end
      end
    end
    count
  end

  # NOTE: the loaders below were written under a bare `private`, which does
  # not apply to `def self.` methods, so they have always been callable from
  # outside. They stay public to keep that interface.

  # * Load a dictionary from a file.
  # * [dictionary] Hash of Arrays that receives the entries
  # * [name] Dictionary name (used in messages)
  # * [file] File path
  # * Lines starting with ";; " declare a lemma id, other lines starting with
  #   ";" are comments, everything else is a 4-field tab-separated entry.
  # * Raises ArgumentError on a duplicate lemma id or a malformed entry; the
  #   reported number is the 1-based line in the file.
  def self.load_dictionary(dictionary, name, file)
    lemmas = Set.new
    forms = 0
    lemma_id = ""
    puts "Loading Dictionary : #{ name }"
    File.readlines(file).each_with_index do |line, index|
      line_number = index + 1
      if line.start_with?(";; ")
        lemma_id = line[3..-1]
        raise ArgumentError, "Lemma #{lemma_id } in #{name} #{line_number}  isn't unique" if lemmas.member?(lemma_id)

        lemmas << lemma_id
      elsif !line.start_with?(";")
        fields = line.split("\t", -1)
        raise ArgumentError, "Entry In #{name} line #{line_number} doesn't have 4 fields ( 3 tabs )" unless fields.length == 4

        de = construct_dictionary_entry(fields, name, line_number, lemma_id)
        (dictionary[de.entry] ||= []) << de
        forms += 1
      end
    end
    puts "#{lemmas.size()}  lemmas and " unless lemma_id == ""
    puts("#{dictionary.length}  entries totalizing  #{forms}  forms")
  end

  # * Load a compatibility table.
  # * [set] Set that receives one "<left> <right>" string per table line
  # * [name] Table name (used in messages)
  # * [file] File path
  def self.load_compatibility_table(set, name, file)
    puts "Loading compatibility table : #{name}  "
    File.readlines(file).each do |line|
      next if line.start_with?(";") # ignore comments

      set << line.strip.gsub(COMPATIBILITY_SPLITTER, " ")
    end
    puts "#{set.size()} entries"
  end

  # * Construct a DictionaryEntry from a split dictionary line.
  # * [splited_line] the 4 fields: entry, vocalization, morphology, gloss(+POS)
  # * [name] dictionary name (used in error messages)
  # * [line_count] line number (used in error messages)
  # * [lemma_id] id of the lemma the entry belongs to
  def self.construct_dictionary_entry(splited_line, name, line_count, lemma_id)
    entry, vocalization, morphology, gloss_pos = splited_line

    # Two ways to get the POS info:
    # (1) explicitly, by extracting it from the gloss field;
    # (2) by deduction from the morphology (and sometimes the gloss).
    explicit = POS_REGEX.match(gloss_pos)
    pos = explicit ? explicit[1] : deduce_pos(morphology, vocalization, gloss_pos, name, line_count)

    # Clean up the gloss: remove POS info and surrounding space, then let the
    # Translator convert it (presumably upper-ASCII clean-up; see Translator).
    gloss = gloss_pos.sub(/<pos>.+?<\/pos>/, "").strip
    gloss = Translator.new.translate(gloss)
    DictionaryEntry.new(entry, lemma_id, vocalization, morphology, gloss, pos)
  end

  # Full path of a file inside the dictionaries directory.
  def self.dictionary_path(file_name)
    DICTIONARIES_DIR + "/" + file_name
  end
  private_class_method :dictionary_path

  # Deduces the POS string from the morphology when the gloss carries no
  # explicit <pos> annotation. Raises RuntimeError when nothing matches.
  def self.deduce_pos(morphology, vocalization, gloss, name, line_count)
    return "" if morphology =~ NO_POS_MORPHOLOGY

    _pattern, tag = MORPHOLOGY_POS_TAGS.find { |pattern, _tag| morphology =~ pattern }
    return "#{vocalization} #{tag}" if tag

    raise "No POS can be deduced in #{ name}  (line  #{line_count})" unless morphology =~ NOUN_MORPHOLOGY

    # Educated guess (99% correct): a capitalized gloss means a proper noun.
    # Everything else is NOUN (the former NOUN_ADJ case for vocalizations
    # ending in "iy~" produced the same tag and needs manual tagging anyway).
    tag = gloss =~ PROPER_NOUN_GLOSS ? "/NOUN_PROP" : "/NOUN"
    "#{vocalization} #{tag}"
  end
  private_class_method :deduce_pos
end

data/lib/raramorph/in_memory_solutions_handler.rb ADDED Viewed

@@ -0,0 +1,78 @@
# In-memory store for the solutions and alternative spellings found by the
# morphological analyzer, keyed by transliterated word.
#
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class InMemorySolutionsHandler
  # ::create hands out one shared instance, but ::new is left public as well.
  public_class_method :new

  # The shared instance returned by ::create.
  @@handler = nil
  # Hash of solutions for analyzed words.
  @@solutions = {}
  # Hash of alternative spellings.
  @@alternative_spellings = {}

  # Returns the shared handler, building it on first use.
  def self.create
    @@handler ||= new
  end

  # Store the solutions for a given word.
  # * [translitered] The transliterated word.
  # * [sol] The solution(s) for the transliterated word.
  def add_solutions(translitered, sol)
    @@solutions[translitered] = sol
  end

  # Whether solutions were already stored for the word.
  # * [translitered] The transliterated word
  # * @return true or false
  def has_solutions(translitered)
    @@solutions.key?(translitered)
  end

  # Return the solutions stored for a word.
  # * [translitered] The transliterated word
  # * @return the stored solutions, or nil when the word is unknown
  def get_solutions(translitered)
    @@solutions.fetch(translitered, nil)
  end

  # Store the alternative spellings for a given word.
  # * [translitered] The transliterated word
  # * [alt] The alternative spelling(s)
  def add_alternative_spellings(translitered, alt)
    @@alternative_spellings[translitered] = alt
  end

  # Whether alternative spellings were already stored for the word.
  # * [translitered] The transliterated word
  # * @return true or false
  def has_alternative_spellings(translitered)
    @@alternative_spellings.key?(translitered)
  end

  # Return the alternative spellings stored for a word.
  # * [translitered] The transliterated word
  # * @return the stored spellings, or nil when the word is unknown
  def get_alternative_spellings(translitered)
    @@alternative_spellings.fetch(translitered, nil)
  end
end

data/lib/raramorph/latin_arabic_translator.rb ADDED Viewed

@@ -0,0 +1,35 @@
# Latin -> Arabic transliteration (Buckwalter system).
# Author:: eSpace technologies  www.eSpace.com.eg
# Copyright:: 2008
class LatinArabicTranslator
  # * Table used to transliterate Latin letters into Arabic (i.e. arabize a word)
  # * according to the Buckwalter system.
  TABLE = {
    "'" => "\u0621", "|" => "\u0622", ">" => "\u0623", "&" => "\u0624",
    "<" => "\u0625", "}" => "\u0626", "A" => "\u0627", "b" => "\u0628",
    "p" => "\u0629", "t" => "\u062A", "v" => "\u062B", "j" => "\u062C",
    "H" => "\u062D", "x" => "\u062E", "d" => "\u062F", "*" => "\u0630",
    "r" => "\u0631", "z" => "\u0632", "s" => "\u0633", "$" => "\u0634", "S" => "\u0635",
    "D" => "\u0636", "T" => "\u0637", "Z" => "\u0638", "E" => "\u0639", "g" => "\u063A",
    "_" => "\u0640", "f" => "\u0641", "q" => "\u0642", "k" => "\u0643", "l" => "\u0644",
    "m" => "\u0645", "n" => "\u0646", "h" => "\u0647", "w" => "\u0648", "Y" => "\u0649", "y" => "\u064A",
    "F" => "\u064B", "N" => "\u064C", "K" => "\u064D", "a" => "\u064E", "u" => "\u064F", "i" => "\u0650",
    "~" => "\u0651", "o" => "\u0652", "`" => "\u0670", "{" => "\u0671", "P" => "\u067E", "J" => "\u0686",
    "V" => "\u06A4", "G" => "\u06AF", "R" => "\u0698", "," => "\u060C", ";" => "\u061B", "?" => "\u061F"
  }.freeze

  # * Translate : transliterate the Roman-lettered word into an Arabic word.
  # * [word] Word String to be processed. It is not modified (frozen strings
  #   are accepted); a copy is re-tagged as UTF-8 before being read.
  # * @return transliterated word; characters missing from TABLE are kept as-is
  #
  def self.translate(word)
    # Work on a copy: force_encoding on the argument itself changed the
    # caller's string and raised on frozen strings.
    text = word.dup.force_encoding("UTF-8")
    text.each_char.map { |char| TABLE.fetch(char, char) }.join
  end
end

data/lib/raramorph/logger.rb ADDED Viewed

@@ -0,0 +1,20 @@
# StringIO is not loaded by default; without this, Logger.new raises
# NameError unless some other file happened to require it first.
require 'stringio'

# Buffers messages in memory and writes them out in one go, either to
# standard output or to a file.
#
# NOTE(review): this shares its name with the standard library's Logger; if
# 'logger' is loaded in the same process this definition reopens that class.
class Logger
  attr_reader :verbose, :output

  # * [verbose] verbosity flag; stored and exposed, not consulted here
  # * [output] path of the file #log writes to, or nil for standard output
  def initialize(verbose = nil, output = nil)
    @verbose = verbose
    @output = output
    @stream = StringIO.new
  end

  # Appends a message line to the buffer.
  # * [string] the message
  # * [require_verbose] accepted for compatibility; currently ignored, so
  #   every message is buffered regardless of the verbose flag
  def info(string, require_verbose = false)
    @stream.puts(string)
  end

  # Emits everything buffered so far: printed to standard output when no
  # output path was given, otherwise written to that file (overwriting it).
  def log
    return puts(@stream.string) if @output.nil?

    File.open(@output, "w") { |f| f.puts @stream.string }
  end
end