nlp 0.2.7 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analizators/analyzer.rb +28 -0
- data/lib/analizators/liwc_analyzer.rb +68 -0
- data/lib/analizators/rid_analyzer.rb +10 -0
- data/lib/dictionaries/category.rb +27 -0
- data/lib/dictionaries/dictionary.rb +76 -0
- data/lib/dictionaries/liwc_category.rb +54 -0
- data/lib/dictionaries/pl_trie.rb +31 -0
- data/lib/dictionaries/rid_category.rb +21 -0
- data/lib/nlp.rb +0 -1
- data/lib/tagger/emoticon.rb +13 -0
- data/lib/tagger/inflectable.rb +59 -0
- data/lib/tagger/lemmatizer.rb +112 -0
- data/lib/tagger/meaningable.rb +63 -0
- data/lib/tagger/sentence.rb +24 -0
- data/lib/tagger/takipi_web_service.rb +51 -0
- data/lib/tagger/text.rb +24 -0
- data/lib/tagger/token.rb +45 -0
- data/lib/tagger/token_scanner.rb +58 -0
- data/lib/tagger/word.rb +20 -0
- metadata +21 -4
- data/lib/morfeusz.rb +0 -69
@@ -0,0 +1,28 @@
|
|
1
|
+
module NLP
  # Base dictionary-driven analyzer: looks every word token up in a
  # Dictionary and accumulates the hits in a TextStatistics object.
  class Analyzer

    # dict - dictionary identifier passed straight to Dictionary.new
    def initialize(dict)
      @dictionary = Dictionary.new(dict)
    end

    # Walks the scanner over all word tokens, recording dictionary category
    # hits per lemma. Returns the populated TextStatistics.
    def analyze(scanner)
      stats = TextStatistics.new

      while (token = scanner.current)
        lemma = token.lemat
        cats = @dictionary.find(lemma)
        stats.add(lemma, cats) unless cats.nil?
        stats.total_words += 1
        scanner.next(:word)
      end

      stats
    end
  end
end
|
28
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module NLP
  # LIWC (Linguistic Inquiry and Word Count) analyzer: buckets every
  # alphanumeric token into LIWC-style category lists collected in the
  # returned TextStatistics.
  class LIWCAnalyzer < Analyzer

    def initialize
      @dictionary = Dictionary.new(:liwc)
    end

    # Scans word and number tokens, assigning each its LIWC category and
    # pushing its surface form into the matching result buckets.
    # Returns a TextStatistics whose +hash+ maps bucket name -> word list.
    def analyze(scanner)
      results = TextStatistics.new
      results.hash = {
        :long_words => [],
        :zaimki => [],
        :zaimki1 => [],
        :zaimki2 => [],
        :zaimki3 => [],
        :przyimki => [],
        :numbers => [],
        :emotion => [],
        :social => [],
        :personal => [],
        :posemotion => [],
        :negemotion => [],
        :wulgar => [],
        :cognitive => []
      }

      while token = scanner.current
        word = token.lemat
        # strip everything except word characters and hyphens before lookup
        categories = @dictionary.find(word.gsub(/[^\w-]/, ""))

        unless categories.nil?
          results.add(word, categories)
          token.category = categories.first

          results[:emotion].push token.orth if token.emotion?
          results[:social].push token.orth if token.social?
          results[:personal].push token.orth if token.personal?
          results[:wulgar].push token.orth if token.bad_word?
          results[:cognitive].push token.orth if token.cognitive?

          results[:posemotion].push token.orth if token.positive_emotion?
          results[:negemotion].push token.orth if token.negative_emotion?
        end

        # words longer than 10 characters.
        # FIX: String#jlength came from the Ruby 1.8 `jcode` library and does
        # not exist in Ruby >= 1.9; String#length counts characters under the
        # string's encoding there, which is what jlength approximated.
        results[:long_words].push word if word.length > 10

        if token.zaimek?
          results[:zaimki].push word

          # grammatical person buckets: 1st (ja/my), 2nd (ty/wy), 3rd (on)
          results[:zaimki1].push token.orth if word == 'ja' || word == 'my'
          results[:zaimki2].push token.orth if word == 'ty' || word == 'wy'
          results[:zaimki3].push token.orth if word == 'on'
        end

        results[:przyimki].push word if token.przyimek?
        results[:numbers].push token.orth if token.number? || token.liczebnik?

        results.total_words += 1
        scanner.next(:alphanum)
      end
      results
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module NLP
  # One node in a dictionary category tree (RID/LIWC hierarchies).
  # Each category knows its symbolic name and optional parent category.
  class Category
    attr_reader :parent, :name

    # name   - category label (String or Symbol); stored as a Symbol
    # parent - enclosing Category, or nil for a top-level category
    def initialize(name, parent = nil)
      @parent = parent
      @name = name.to_sym
    end

    # Slash-separated path from the root category down to this node.
    def path
      return name.to_s unless @parent
      "#{@parent.path}/#{name}"
    end

    # Symbol name of the topmost ancestor (self when there is no parent).
    def root
      node = self
      node = node.parent until node.parent.nil?
      node.name
    end

    # Quoted path, e.g. "\"EMOCJE/POZYTYWNE_EMOCJE\"".
    def to_s
      path.inspect
    end

  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module NLP
  # Word -> category dictionary backed by a PlTrie. The parsed trie is
  # cached via Marshal under DICTIONARY_CACHE_DIR so later loads skip
  # re-parsing the category file.
  class Dictionary

    attr_accessor :tree

    # category_file - dictionary id (:rid or :liwc); also the cache suffix
    # restore       - when true, load the marshalled trie cache if present
    def initialize(category_file = :rid, restore = true)
      state_file = File.expand_path(DICTIONARY_CACHE_DIR + ".#{category_file}")
      if restore && File.exist?(state_file)
        @tree = Dictionary.restore(state_file)
      else
        @tree = PlTrie.new
        load_categories(File.dirname(__FILE__) + "/../../dict/#{category_file}", category_file)
        store(state_file)
      end
    end

    # Marshals the trie to +state_file+. Returns self.
    def store(state_file)
      File.open(File.expand_path(state_file), "w") do |file|
        Marshal.dump(self.tree, file)
      end
      self
    end

    # Loads a previously marshalled trie.
    # NOTE: Marshal.restore may only ever be fed trusted files (this gem's
    # own cache) — deserializing untrusted data can execute arbitrary code.
    def self.restore(state_file)
      File.open(File.expand_path(state_file)) do |file|
        Marshal.restore(file)
      end
    end

    # Returns the categories stored for +word+, or nil when the word is
    # absent or contains characters outside the trie alphabet.
    # FIX: was a bare `rescue` (which also caught ScriptError and friends);
    # narrowed to StandardError while keeping the best-effort nil result.
    def find(word)
      @tree.find(word)
    rescue StandardError
      nil
    end

    # Parses an indented category file: leading tab count encodes hierarchy
    # depth (0 = primary, 1 = secondary, 2 = tertiary when the line is ALL
    # CAPS); any other line is a word inserted under the current category.
    # FIX: removed the inner `begin ... rescue; raise; end`, which was a
    # no-op (re-raising every exception unchanged).
    def load_categories(category_file, type)
      category = nil
      primary = nil
      secondary = nil
      tertiary = nil

      cat_class = type == :rid ? RIDCategory : LIWCCategory

      File.open(category_file) do |file|
        while line = file.gets
          line.chomp!
          lead, rest = line.scan(/(\t*)(.*)/).first
          if lead.size == 0
            category = primary = cat_class.new(rest)
            secondary, tertiary = nil
          elsif lead.size == 1
            category = secondary = cat_class.new(rest, primary)
            tertiary = nil
          elsif lead.size == 2 && (cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
            category = tertiary = cat_class.new(rest, secondary)
          else
            # LIWC word lists carry trailing "(1)" markers — strip them
            word = rest.downcase.gsub(/\s*\(1\)$/, '')
            @tree.insert(word, category)
          end
        end
      end
    end
  end
end
|
76
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module NLP
  # LIWC dictionary category. Primary predicates test the root of the
  # category tree; secondary predicates test fragments of the full path.
  class LIWCCategory < Category

    # primary categories: predicate name => expected tree root
    {
      :linguistic?    => :PIERWOTNE,
      :psychological? => :PROCESY_PSYCHOLOGICZNE,
      :relative?      => :RELATYWNOSC,
      :personal?      => :OSOBISTE
    }.each do |meth, root_sym|
      define_method(meth) { root == root_sym }
    end

    # secondary categories: predicate name => fragment sought in the path
    {
      :emotion?          => 'EMOCJE',
      :positive_emotion? => 'POZYTYWNE_EMOCJE',
      :negative_emotion? => 'NEGATYWNE_EMOCJE',
      :cognitive?        => 'KOGNITYWNE_PROCESY',
      :sense?            => 'ZMYSLY',
      :social?           => 'SOCIAL',
      :bad_word?         => 'WULGAR'
    }.each do |meth, fragment|
      define_method(meth) { path.include?(fragment) }
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'ds'

# Trie specialised for Polish text, built on the DS gem's Trie class.
module NLP

  include DS

  class PlTrie < Trie

    # Polish alphabet (plus hyphen and trailing space) — presumably consumed
    # by the inherited #key to map characters to child indices; TODO confirm
    # against the ds gem's Trie implementation.
    ALPHABET = %w{- a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u v w x y z ź ż} << ' '

    # Recursive insert helper: walks down one trie node per element of +s+
    # and appends +value+ to the @data list at the terminal node, so several
    # values can accumulate under the same key.
    # NOTE(review): the commented `private` marker suggests this was meant to
    # be private, but it must stay public because each node calls it on its
    # child subtrees.
    #private
    def priv_insert(s, value)
      if s.empty?
        # terminal node reached: start or extend the value list
        if @data.nil?
          @data = [value]
        else
          @data.push value
        end
      else
        # descend into (or lazily create) the child for the first element
        index = key(s.first)
        subtree = if @children[index]
          @children[index]
        else
          @children[index] = PlTrie.new
        end

        subtree.priv_insert(s[1..-1], value)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module NLP
  # Category from the Regressive Imagery Dictionary: the tree roots are
  # primary process, secondary process and emotions.
  class RIDCategory < Category

    # The three RID top-level categories as fresh instances.
    def self.top_level
      [:PIERWOTNE, :WTORNE, :EMOCJE].map { |sym| new(sym) }
    end

    # Primary-process category?
    def primary?
      :PIERWOTNE == root
    end

    # Secondary-process category?
    def secondary?
      :WTORNE == root
    end

    # Emotions category?
    def emotions?
      :EMOCJE == root
    end

  end
end
|
data/lib/nlp.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Mixin that generates grammatical predicate methods (rzeczownik?,
# liczba_mnoga?, mianownik?, ...) from a tag table. Host objects must
# provide #inflection returning a colon-separated morphosyntactic tag
# string (e.g. "subst:sg:nom:m1").
module Inflectable

  # Maps tagger tag symbols to the Polish predicate base names generated
  # below. An Array key means several tags share one predicate.
  GRAM_CAT = {
    # parts of speech
    :adj => 'przymiotnik',           # adjective
    [:subst,:depr] => 'rzeczownik',  # noun
    :adv => 'przyslowek',            # adverb
    :num => 'liczebnik',             # numeral
    [:pron,:siebie] => 'zaimek',     # pronoun
    :prep => 'przyimek',             # preposition

    # number
    :sg => 'liczba_pojedyncza',
    :pl => 'liczba_mnoga',

    # cases
    :nom => 'mianownik',
    :gen => 'dopelniacz',
    :dat => 'celownik',
    :acc => 'biernik',
    :inst => 'narzednik',
    :loc => 'miejscownik',
    :voc => 'wolacz',

    # genders
    :m1 => 'meski_osobowy',
    :m2 => 'meski_zwierzecy',
    :m3 => 'meski_rzeczowy',
    :f => 'zenski',
    :n1 => 'nijaki_zbiorowy',
    # FIX: was 'nijaki zwykly' — the space produced a method named
    # "nijaki zwykly?" that cannot be called with normal syntax and broke
    # the underscore convention every other entry follows.
    :n2 => 'nijaki_zwykly',
    :p1 => 'przymnogi_osobowy',
    :p2 => 'przymnogi_zwykly',
    :p3 => 'przymnogi_opisowy',

    # persons
    :pri => "pierwsza_osoba",
    :sec => "druga_osoba",
    :ter => "trzecia_osoba",

    # degrees of comparison
    :pos => "stopien_rowny",
    :comp => "stopien_wyzszy",
    :sup => "stopien_najwyzszy"
  }

  # For each entry define `<name>?`, true when any colon-separated segment
  # of the inflection string contains one of the mapped tags.
  GRAM_CAT.each do |key, value|
    define_method(value + "?") {
      tags = key.is_a?(Array) ? key : [key]
      inflection.split(":").any? { |segment|
        tags.any? { |tag| segment.include? tag.to_s }
      }
    }
  end

end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'rexml/document'
require 'shellwords'

module NLP
  # Turns raw text into a tagged NLP::Text using an external lemmatizer
  # (the TAKIPI tagger, either a local binary or the remote web service).
  class Lemmatizer

    include REXML

    # Lemmatizes +text+ (a String, or an open File which is read and
    # closed). method selects the backend (:takipi; anything else falls
    # back to the remote TAKIPI service). Raises ArgumentError for other
    # input types. Returns an NLP::Text.
    def self.lemmatize(text, method = nil, input_type = nil)
      if text.is_a? File
        str = text.read
        text.close
      elsif text.is_a? String
        str = text
      else
        raise ArgumentError, "Argument is not String or File"
      end

      if method == :takipi
        takipi_lemmatize(str, input_type)
      else
        # Default lemmatization method is the remote TAKIPI service;
        # the Morfeusz backend below is currently disabled.
        takipi_lemmatize(str, :remote)
        #morfeusz_lemmatize(str)
      end
    end

    # Runs TAKIPI either as a local binary (:local) or via the SOAP web
    # service (:remote) and parses the resulting XML.
    def self.takipi_lemmatize(text, method)
      if method == :local
        xml_file = TAKIPI_XML_FILE

        # FIX (security): the text is interpolated into a shell command, so
        # it must be escaped — the old `echo '#{text}'` let a single quote
        # in the input break out and inject arbitrary shell.
        # (Also dropped a Thread.new immediately followed by join — it added
        # nothing over running the command inline.)
        `echo #{Shellwords.escape(text)} > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`

        doc = File.open(xml_file, "r") do |f|
          # block form closes the handle (the old code leaked it)
          Document.new f
        end
      elsif method == :remote
        xml = TakipiWebService.request(text)
        doc = Document.new xml
      else
        raise ArgumentError, 'Argument is not :local or :remote'
      end

      parse_lemmatized_xml(doc)
    end

    # Morfeusz-based backend — currently disabled pending a TokenScanner
    # abstraction for splitting the input. Returns an empty Text.
    def self.morfeusz_lemmatize(text)
      temp_text = Text.new

      #simple tagger
      #TODO lemmatizer should take TokenScanner object that defines
      #how split string
      # text.split(/\.|!|\?/).each do |s|
      #   sentence = Sentence.new
      #   sentence << s.split(" ").collect{ |t|
      #     if word = Morfeusz::Lexeme.find(t)
      #       if word[0]
      #         Word.new(t,word[0].base_form,"")
      #       else
      #         Word.new(t,"","")
      #       end
      #     else
      #       Word.new(t,"","")
      #     end
      #   }
      #   temp_text << sentence
      # end
      temp_text
    end

    # Builds an NLP::Text out of TAKIPI's XML: each <chunk> becomes a
    # Sentence, each <tok> a Word carrying lemma and inflection tags.
    def self.parse_lemmatized_xml(doc)
      text = Text.new

      doc.elements.each("*/chunkList/chunk") do |chunk|
        sentence = Sentence.new
        tokens = []

        chunk.elements.each("tok") do |tok|
          word = tok.elements[1].text
          # FIX: `lemat, inflect = ""` assigned "" only to lemat and left
          # inflect nil; both must default to the empty string.
          lemat = inflect = ""

          tok.elements.each("lex") do |lex|
            if lex.has_attributes?
              lemat = lex.elements[1].text
              inflect = lex.elements[2].text
            end
          end

          tokens << Word.new(word, lemat, inflect)
        end

        sentence << tokens
        text << sentence
      end
      text
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Mixin giving a token semantic (LIWC) predicates. Host objects must
# provide #category returning an object that responds to #root and #path.
module Meaningable

  # LIWC primary categories: predicate name => expected category-tree root.
  {
    :linguistic?    => :PIERWOTNE,
    :psychological? => :PROCESY_PSYCHOLOGICZNE,
    :relative?      => :RELATYWNOSC,
    :personal?      => :OSOBISTE
  }.each do |meth, root_sym|
    define_method(meth) { category.root == root_sym }
  end

  # LIWC secondary categories: predicate name => fragment of category path.
  {
    :emotion?          => 'EMOCJE',
    :positive_emotion? => 'POZYTYWNE_EMOCJE',
    :negative_emotion? => 'NEGATYWNE_EMOCJE',
    :cognitive?        => 'KOGNITYWNE_PROCESY',
    :sense?            => 'ZMYSLY',
    :social?           => 'SOCIAL',
    :bad_word?         => 'WULGAR'
  }.each do |meth, fragment|
    define_method(meth) { category.path.include?(fragment) }
  end

  # SEMANTIC — unimplemented placeholders kept for interface compatibility.
  def synonym?(other)
  end

  def synonyms
  end

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # Ordered collection of tokens forming one sentence.
  class Sentence

    attr_reader :tokens

    def initialize
      @tokens = []
    end

    # Appends a single token or a whole array of tokens.
    # Returns self so appends can be chained.
    def <<(items)
      case items
      when Array then @tokens.concat(items)
      else @tokens << items
      end
      self
    end

    # Number of tokens that are not punctuation.
    def words_number
      @tokens.reject(&:interp?).size
    end

  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
require 'savon'

# SOAP client for the TAKIPI morphosyntactic tagger hosted by the CLARIN
# project at Wrocław University of Technology.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Submits +text+ for tagging, polls the job until it finishes, fetches the
  # result and returns it wrapped into a well-formed XML string.
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Call remote service methods
    response = client.tag do |soap|
      soap.body = "<text>#{text}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end

    response = response.to_hash
    token = response[:tag_response][:tag_response][:msg]
    status = (response[:tag_response][:tag_response][:status]).to_i

    # Poll job status — presumably 1 = done, 2/3 = still processing
    # (service status codes are not documented here; TODO confirm).
    # NOTE(review): `count` grows by 5 per 1-second sleep, so the loop gives
    # up after ~12 iterations rather than the 60 seconds `timeout` suggests,
    # and `step` is never used — confirm the intended polling interval.
    # NOTE(review): an unexpected status (neither 1 nor 2/3) never increments
    # `count`, so the loop would spin forever — confirm this cannot happen.
    timeout = 60
    step = 5
    count = 0
    loop do
      break if count > timeout
      if status == 1
        break
      elsif status == 2 or status == 3
        count += 5
        sleep(1)
        r = client.get_status do |soap|
          soap.body = "<token>#{token}</token>"
        end.to_hash
        status = (r[:get_status_response][:status]).to_i

      end
    end

    # fetch the finished result

    result = client.get_result do |soap|
      soap.body="<token>#{token}</token>"
    end

    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # wrap the returned chunk fragments into a well-formed XML document
    return "<xml><chunkList>#{response_document}</chunkList></xml>"
  end
end
|
51
|
+
|
data/lib/tagger/text.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # A tagged text: an ordered list of Sentence objects.
  class Text
    attr_reader :sentences

    def initialize
      @sentences = []
    end

    # Appends one sentence to the text.
    def <<(sentence)
      @sentences.push sentence
    end

    # Mean word count per sentence (Array#mean comes from this gem's
    # lib/stdlib/ext/array.rb extension).
    def words_per_sentence
      @sentences.map(&:words_number).mean
    end

    # All tokens of all sentences, flattened into one array.
    def flatten
      @sentences.flat_map(&:tokens)
    end

  end
end
|
data/lib/tagger/token.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module NLP
  # Smallest unit produced by the tagger: a surface form (+orth+) together
  # with its raw morphosyntactic tag string (+tags+).
  class Token

    attr_reader :orth
    attr_reader :tags

    def initialize(orth, tags)
      @orth = orth
      @tags = tags
    end

    # Symbol token (tag string is exactly "tsym")?
    def symbol?
      @tags.eql? "tsym"
    end

    # Punctuation token (tag string is exactly "interp")?
    def interp?
      @tags.eql? "interp"
    end

    # A word is anything that is not punctuation, a number or an agglutinant.
    def word?
      !(interp? || number? || agl?)
    end

    # Any numeric token.
    def number?
      @tags.include?("tnum")
    end

    # Integer literal.
    def integer?
      @tags.include?("tnum:integer")
    end

    # Fractional literal.
    def float?
      @tags.include?("tnum:frac")
    end

    # Particle ("qub" tag).
    def qublic?
      @tags.include?("qub")
    end

    # Agglutinative form ("agl" tag).
    def agl?
      @tags.include?("agl")
    end

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module NLP
  # Cursor over the flattened token stream of a Text. #next advances to the
  # following token matching the requested kind; #current returns nil once
  # the stream is exhausted.
  class TokenScanner

    attr_reader :text, :tokens

    def initialize(text)
      @text = text
      @pos = 0
      @tokens = @text.flatten
    end

    # Advances at least one token, then keeps moving until a token of the
    # requested +type+ (:word, :interp, :number or :alphanum) is found or
    # the stream ends. Unknown types just advance by one.
    def next(type)
      @pos += 1

      wanted =
        case type
        when :word     then ->(t) { t.word? }
        when :interp   then ->(t) { t.interp? }
        when :number   then ->(t) { t.number? }
        when :alphanum then ->(t) { t.number? || t.word? }
        end
      return if wanted.nil?

      @pos += 1 while @pos < @tokens.size && !wanted.call(@tokens[@pos])
    end

    # Token under the cursor, or nil past the end of the stream.
    def current
      @pos == @tokens.size ? nil : @tokens[@pos]
    end

    # Resets the cursor to the first token.
    def rewind
      @pos = 0
    end

    # Current cursor position.
    def index
      @pos
    end

    # True once the cursor has moved past the last token.
    def end?
      @pos == tokens.size
    end

  end
end
|
data/lib/tagger/word.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module NLP
  # A proper word token: surface form plus its lemma and inflection tags.
  # Gains grammatical predicates from Inflectable (which reads #inflection)
  # and semantic predicates from Meaningable (which reads #category).
  class Word < Token

    include Inflectable
    include Meaningable

    # Base (dictionary) form of the word.
    attr_reader :lemat
    # Dictionary category assigned during analysis (set by LIWCAnalyzer).
    attr_accessor :category

    def initialize(word, lemat, tags)
      super(word,tags)
      @lemat = lemat
    end

    # Raw tag string, exposed under the name Inflectable expects.
    def inflection
      @tags
    end

  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 8
|
10
|
+
version: 0.2.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -59,9 +59,26 @@ extra_rdoc_files:
|
|
59
59
|
files:
|
60
60
|
- dict/liwc
|
61
61
|
- dict/rid
|
62
|
-
- lib/
|
62
|
+
- lib/analizators/analyzer.rb
|
63
|
+
- lib/analizators/liwc_analyzer.rb
|
64
|
+
- lib/analizators/rid_analyzer.rb
|
65
|
+
- lib/dictionaries/category.rb
|
66
|
+
- lib/dictionaries/dictionary.rb
|
67
|
+
- lib/dictionaries/liwc_category.rb
|
68
|
+
- lib/dictionaries/pl_trie.rb
|
69
|
+
- lib/dictionaries/rid_category.rb
|
63
70
|
- lib/nlp.rb
|
64
71
|
- lib/stdlib/ext/array.rb
|
72
|
+
- lib/tagger/emoticon.rb
|
73
|
+
- lib/tagger/inflectable.rb
|
74
|
+
- lib/tagger/lemmatizer.rb
|
75
|
+
- lib/tagger/meaningable.rb
|
76
|
+
- lib/tagger/sentence.rb
|
77
|
+
- lib/tagger/takipi_web_service.rb
|
78
|
+
- lib/tagger/text.rb
|
79
|
+
- lib/tagger/token.rb
|
80
|
+
- lib/tagger/token_scanner.rb
|
81
|
+
- lib/tagger/word.rb
|
65
82
|
- lib/text_statistics.rb
|
66
83
|
- LICENSE
|
67
84
|
- README.rdoc
|
data/lib/morfeusz.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# Ruby bindings for Morfeusz v. 0.1
|
2
|
-
# Author: Aleksander Pohl
|
3
|
-
# apohllo@o2.pl
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
require 'inline'
|
7
|
-
require 'singleton'
|
8
|
-
require 'iconv'
|
9
|
-
module NLP
|
10
|
-
module Morfeusz
|
11
|
-
MORFOPT_ENCODING = 1
|
12
|
-
MORFEUSZ_UTF_8 = 8
|
13
|
-
class Morfeusz
|
14
|
-
include Singleton
|
15
|
-
|
16
|
-
inline(:C) do |builder|
|
17
|
-
builder.include '"morfeusz.h"'
|
18
|
-
builder.add_compile_flags '-lmorfeusz', '-I/home/knife/morf/include/'
|
19
|
-
builder.c <<-END
|
20
|
-
void initialize(){
|
21
|
-
morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
|
22
|
-
}
|
23
|
-
END
|
24
|
-
|
25
|
-
builder.c <<-END
|
26
|
-
char * about(){
|
27
|
-
return morfeusz_about();
|
28
|
-
}
|
29
|
-
END
|
30
|
-
|
31
|
-
builder.c <<-END
|
32
|
-
VALUE _base(VALUE str){
|
33
|
-
char * p;
|
34
|
-
int index = 0;
|
35
|
-
VALUE arr = rb_ary_new();
|
36
|
-
int id_push = rb_intern("push");
|
37
|
-
p = StringValuePtr(str);
|
38
|
-
InterpMorf* result = morfeusz_analyse(p);
|
39
|
-
InterpMorf el;
|
40
|
-
while((el = result[index++]).k != -1){
|
41
|
-
if(el.haslo != NULL){
|
42
|
-
rb_funcall(arr,id_push,1,rb_str_new2(el.haslo));
|
43
|
-
}
|
44
|
-
}
|
45
|
-
return arr;
|
46
|
-
}
|
47
|
-
END
|
48
|
-
|
49
|
-
def base(word)
|
50
|
-
# _base(word)
|
51
|
-
_base(word).collect{|e| e}
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class Lexeme
|
58
|
-
attr_reader :base_form
|
59
|
-
def initialize(base_form)
|
60
|
-
@base_form = base_form
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.find(word)
|
64
|
-
Morfeusz.instance.base(word).collect{|bf| Lexeme.new(bf)}
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|