nlp 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/analyzer.rb
CHANGED
@@ -9,72 +9,40 @@ require 'sentence'
 require "token_scanner.rb"
 require "lemmatizer"
 require 'jcode'
+require 'statistic'
 $KODE = "UTF8"

 module NLP

   class Analyzer

-
-
-    def initialize( category_file, restore = true )
-      state_file = File.expand_path(Analyzer::CACHE_DIR)
-      if restore
-        @dictionary = Dictionary.restore(state_file)
-      else
-        @dictionary = Dictionary.new
-        @dictionary.load_categories( category_file, :rid => true )
-        @dictionary.store(state_file)
-      end
-
+    def initialize(dict)
+      @dictionary = Dictionary.new(dict)
     end
-
-
-    def analyze( scanner)
-
-      results = {
-        :word_count => 0,
-        :word_total => 0,
-        :scores => Hash.new { 0 },
-        :words => []
-      }


+    def analyze(scanner)
+
+      results = Statistic.new

-
+      while token = scanner.current
       word = token.lemat

-      categories = @dictionary.find(
+        categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
       unless categories.nil?
-
-
-
-
-        results[:words].push word
-      end
-
-
+          categories.each do |category|
+            puts "Znalazłem słowo #{word} : #{category}"
+            results.add(word,category)
+          end
       end

-      results
+        results.total_words += 1
       scanner.next(:word)
-
-
-      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
-      primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
-      secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
-      emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
-
+      end

-      results[:classes] = {
-        :primary => Float(primary_sum) / results[:word_count],
-        :secondary => Float(secondary_sum) / results[:word_count],
-        :emotions => Float(emotion_sum) / results[:word_count]
-      }
-
       results
-    end

+    end
   end
 end

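The net effect for callers: Analyzer.new now takes a dictionary identifier instead of a category file path plus a restore flag, and analyze returns the new Statistic object rather than a hand-built hash. A minimal usage sketch under those assumptions; the :rid symbol follows dictionary.rb below, while the TokenScanner construction is hypothetical, since its new signature is not shown in this section:

    require 'nlp'

    analyzer = NLP::Analyzer.new(:rid)    # wraps Dictionary.new(:rid) internally

    # Hypothetical scanner setup; analyze only relies on #current and #next(:word).
    scanner = NLP::TokenScanner.new("jakis polski tekst")
    stats   = analyzer.analyze(scanner)   # a Statistic, not a Hash
    stats.total_words                     # incremented once per scanned token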
data/lib/category.rb
CHANGED
@@ -1,16 +1,16 @@
 module NLP
   class Category
     attr_reader :parent, :name
-
-    def initialize(
+
+    def initialize(name, parent = nil)
       @parent = parent
       @name = name.to_sym
     end
-
+
     def path
-      @parent ? (
+      @parent ? (@parent.path + '/' + name.to_s) : name.to_s
     end
-
+
     def root
       category = self
       while category.parent != nil
@@ -18,11 +18,10 @@ module NLP
       end
       category.name
     end
-
+
     def to_s
       "#{path.inspect}"
     end
-
-
+
   end
 end
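With the argument list and body restored above, initialize stores the name as a symbol, path joins names with slashes up the parent chain, and root returns the top-most name. A small illustration (the category names are invented for the example):

    primary   = NLP::Category.new('EMOTIONS')
    secondary = NLP::Category.new('ANXIETY', primary)

    secondary.path   # => "EMOTIONS/ANXIETY"
    secondary.root   # => :EMOTIONS  (names are stored with to_sym)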
data/lib/dictionary.rb
CHANGED
@@ -5,12 +5,24 @@ require 'rid_category'
 require 'liwc_category'

 module NLP
+
   class Dictionary
-
-
-
+    attr_accessor :tree
+
+
+    def initialize(category_file=:rid,restore = true)
+      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
+      if restore and File.exist?(state_file)
+        d = Dictionary.restore(state_file)
+        @tree = d.tree
+      else
+        @tree = SearchTree.new
+        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
+        store(state_file)
+      end
+
     end
-
+
     def store( state_file )
       File.open( File.expand_path( state_file ), "w" ) do |file|
         Marshal.dump( self, file )
@@ -24,50 +36,40 @@ module NLP
       end
     end

-
-
-    def find( word )
+    def find(word)
       if @exception_pattern && @exception_pattern =~ word
         nil
       else
-        @tree.find(
+        @tree.find(word)
       end
     end
-

-
+
+    def load_categories(category_file,type)
       category = nil
       primary = nil
       secondary = nil
       tertiary = nil
-
+
+      if type == :rid
+        cat_class = NLP.const_get("RIDCategory")
+      else
+        cat_class = NLP.const_get("LIWCCategory")
+      end
+
       File.open( category_file ) do |file|
         while line = file.gets
           line.chomp!
           begin
             lead, rest = line.scan( /(\t*)(.*)/ ).first
             if lead.size == 0
-
-              category = primary = RIDCategory.new( rest )
-            else
-              category = primary = LIWCCategory.new( rest )
-            end
-
+              category = primary = cat_class.new(rest)
               secondary, tertiary = nil
             elsif lead.size == 1
-
-              category = secondary = RIDCategory.new( rest, primary )
-            else
-              category = secondary = LIWCCategory.new(rest,primary)
-            end
+              category = secondary = cat_class.new(rest, primary )
               tertiary = nil
             elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-
-
-              category = tertiary = RIDCategory.new( rest, secondary )
-            else
-              category = tertiary = LIWCCategory.new( rest, secondary )
-            end
+              category = tertiary = cat_class.new( rest, secondary )
             else
               word = rest.downcase.gsub( /\s*\(1\)$/, '' )
               @tree.insert( word, category )
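The constructor now owns the whole cache round trip: the first run parses the bundled dict/<name> file into a SearchTree and marshals it to DICTIONARY_CACHE_DIR.<name>; later runs restore the tree from that file. A usage sketch, assuming DICTIONARY_CACHE_DIR is defined elsewhere in the gem (nlp.rb also changed in this release, but its contents are not shown here):

    dict = NLP::Dictionary.new(:rid)         # parse dict/rid once, cache via Marshal
    dict = NLP::Dictionary.new(:rid, false)  # ignore any cache and re-parse

    dict.find("word")  # whatever the tree stores for the word, or nil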
data/lib/emoticon.rb
CHANGED
@@ -1,14 +1,14 @@
 require 'meaningable'
-module NLP
-  class Emoticon < Token
-    include Meaningable
-
-    def initialize(tokens,tags)
-      @orth = tokens.join("")
-      @tags = 'emoticon'
-    end

+module NLP
+  class Emoticon < Token
+    include Meaningable

+    def initialize(tokens,tags)
+      @orth = tokens.join("")
+      @tags = 'emoticon'
     end
+
+  end
 end

data/lib/inflectable.rb
CHANGED
@@ -1,61 +1,60 @@
 module Inflectable
-[old lines 2-60 are blank in the source view; their content was not captured]
+
+  GRAM_CAT = {
+    #rzeczownik
+    :adj => 'przymiotnik',
+    [:subst,:depr] => 'rzeczownik',
+    :adv => 'przyslowek',
+    :num => 'liczebnik',
+    [:pron,:siebie] => 'zaimek',
+    :prep => 'przyimek',
+    #liczby
+    :sg => 'liczba_pojedyncza',
+    :pl => 'liczba_mnoga',
+
+    #Przypadki
+    :nom => 'mianownik',
+    :gen => 'dopelniacz',
+    :dat => 'celownik',
+    :acc => 'biernik',
+    :inst => 'narzednik',
+    :loc => 'miejscownik',
+    :voc => 'wolacz',
+
+    #Rodzaje
+    :m1 => 'meski_osobowy',
+    :m2 => 'meski_zwierzecy',
+    :m3 => 'meski_rzeczowy',
+    :f => 'zenski',
+    :n1 => 'nijaki_zbiorowy',
+    :n2 => 'nijaki zwykly',
+    :p1 => 'przymnogi_osobowy',
+    :p2 => 'przymnogi_zwykly',
+    :p3 => 'przymnogi_opisowy',
+
+    #Osoby
+    :pri => "pierwsza_osoba",
+    :sec => "druga_osoba",
+    :ter => "trzecia_osoba",
+
+    #Stopień
+    :pos => "stopien_rowny",
+    :comp => "stopien_wyzszy",
+    :sup => "stopien_najwyzszy"
+  }
+
+  GRAM_CAT.each do |key,value|
+
+    define_method(value+"?"){
+      inflection.split(":").any?{|e|
+        if key.is_a? Array
+          key.any?{|k| e.include? k.to_s}
+        else
+          e.include? key.to_s
+        end
+      }
+    }
+  end
+
+
 end
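The GRAM_CAT table drives the metaprogramming at the bottom: each value becomes a predicate method that tests the colon-separated segments of the including object's inflection string. Note the substring match via include?, so a one-letter key such as :f can also hit unrelated tags that merely contain that letter. A sketch with a hypothetical including class:

    class DemoToken
      include Inflectable
      attr_reader :inflection

      def initialize(inflection)
        @inflection = inflection   # a Morfeusz-style tag string
      end
    end

    t = DemoToken.new("subst:sg:nom:m3")
    t.rzeczownik?    # => true  ("subst" matches the [:subst,:depr] key)
    t.mianownik?     # => true  ("nom")
    t.liczba_mnoga?  # => false (no "pl" segment)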
data/lib/lemmatizer.rb
CHANGED
@@ -3,106 +3,110 @@ require 'rexml/document'
 require 'morfeusz'

 module NLP
- class Lemmatizer
+  class Lemmatizer
+
     include REXML
-
-    def self.lematize(text,method,input_type)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lematize(str,input_type)
-      #Default lematization method is Morfeusz
-      else
-        morfeusz_lematize(str)
-      end

+    def self.lemmatize(text,method=nil,input_type=nil)
+      if text.is_a? File
+        str = text.read
+        text.close
+      elsif text.is_a? String
+        str = text
+      else
+        raise ArgumentError, "Argument is not String or File"
+      end
+
+      if method === :takipi
+        takipi_lemmatize(str,input_type)
+
+      #Default lematization method is Morfeusz
+      else
+        morfeusz_lemmatize(str)
+      end
     end



-    def self.
-
-    if method === :local
+    def self.takipi_lemmatize(text,method)

-
-      `takipi -i text.txt -o output.xml -it TXT`
-      end
+      if method === :local

-
+        xml_file = TAKIPI_XML_FILE

-
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
+        t1 = Thread.new do
+          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
       end

-
+        t1.join
+
+        f = File.open(xml_file,"r")
+        doc = Document.new f
+
+      elsif method === :remote
+        xml = TakipiWebService.request(text)
+        doc = Document.new xml
+      else
+        raise ArgumentError, 'Argument is not :local or :remote'
+      end
+
+      parse_lemmatized_xml(doc)
     end


-    def self.
-[old lines 54-66 are blank in the source view; their content was not captured]
-      end
-      else
-        Word.new(t,"","")
-      end
-    }
-    temp_text.push sentence
-    end
-    temp_text
-    end
-
-
-    def self.parse_lematized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
+    def self.morfeusz_lemmatize(text)
+      temp_text = Text.new
+
+      #simple tagger
+      #TODO lemmatizer should take TokenScanner object that defines
+      #how split string
+      text.split(/\.|!|\?/).each do |s|
+        sentence = Sentence.new
+        sentence << s.split(" ").collect{ |t|
+          if word = Morfeusz::Lexeme.find(t)
+            if word[0]
+              Word.new(t,word[0].base_form,"")
+            else
+              Word.new(t,"","")
          end
+          else
+            Word.new(t,"","")
+          end
+        }
+        temp_text << sentence
+      end
+      temp_text
+    end
+

-
-
+    def self.parse_lemmatized_xml(doc)
+
+      text = Text.new
+
+      doc.elements.each("*/chunkList/chunk") do |chunk|
+        sentence = Sentence.new
+        tokens = []
+
+        chunk.elements.each("tok") do |tok|
+          word = tok.elements[1].text
+          lemat, inflect = ""
+
+          tok.elements.each("lex") do |lex|
+            if lex.has_attributes?
+              lemat = lex.elements[1].text
+              inflect = lex.elements[2].text
+            end
+          end
+
+          tokens << Word.new(word,lemat,inflect)
         end
-
+
+        sentence << tokens
+        text << sentence
+      end
+      text
     end


- end
+  end
 end
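With the lematize/lemmatize renames in place, lemmatize is the single public entry point and Morfeusz is the default backend (method and input_type now default to nil). A usage sketch; the default path needs the Morfeusz bindings required at the top of the file, and the :takipi variants need a local takipi binary or a reachable TakipiWebService:

    # Default: Morfeusz, splitting sentences on . ! ? and tokens on spaces.
    text = NLP::Lemmatizer.lemmatize("Ala ma kota. Ala ma psa.")

    # TAKIPI: both variants funnel the tagger's XML through parse_lemmatized_xml.
    NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :local)
    NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :remote)

Each call returns a Text of Sentence objects whose tokens are Word instances built as Word.new(orth, lemat, inflect).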