RubyGems - nlp - Versions diffs - 0.2.1 - Mend

nlp 0.2.1

Files changed (23) hide show

data/lib/analyzer.rb ADDED Viewed

@@ -0,0 +1,72 @@
+require 'dictionary'
+require 'morfeusz'
+require 'token'
+require 'word'
+require 'emoticon'
+require 'sentence'
+require "token_scanner.rb"
+require "inflectable"
+require "meaningable"
+# Ruby 1.8 consults the global $KCODE (not $KODE) to decide how strings and
+# regexps split multi-byte text; later Ruby versions ignore the assignment.
+$KCODE = "UTF8"
+module NLP
+  class Analyzer
+    include REXML
+   #Lexeme = Apohllo::Morfeusz::Lexeme
+    def initialize( category_file, restore = true )
+        state_file = File.expand_path(Dictionary::CACHE_DIR)
+        if restore
+           @dictionary = Dictionary.restore(state_file)
+        else
+            @dictionary = Dictionary.new
+            @dictionary.load_categories( category_file )
+            @dictionary.store(state_file)
+        end
+    end
+    def analyze( scanner)
+     results = {
+        :word_count => 0,
+        :word_total => 0,
+        :scores => Hash.new { 0 },
+        :words => []
+      }
+     while token = scanner.current
+        word = token.lemat
+        categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
+        unless categories.nil?
+            categories.each do |category|
+                puts "#{word} : #{category.name}"
+                results[:scores][category] = results[:scores][category] + 1
+            end
+            results[:word_count] += 1
+            results[:words].push word
+        end
+        results[:word_total] += 1
+        scanner.next(:word)
+     end
+      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
+      results[:classes] = {
+        :primary => Float(results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }) / results[:word_count],
+        :secondary => Float(results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }) / results[:word_count],
+        :emotions => Float(results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }) / results[:word_count]
+      }
+      results
+    end
+  end
+end

data/lib/category.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module NLP
+  class Category
+    attr_reader :parent, :name
+    def initialize( name, parent = nil )
+      @parent = parent
+      @name = name.to_sym
+    end
+    def path
+      @parent ? ( @parent.path + '/' + name.to_s ) : name.to_s
+    end
+    def root
+      category = self
+      while category.parent != nil
+        category = category.parent
+      end
+      category.name
+    end
+    def to_s
+      "#{path.inspect}"
+    end
+  end
+end

data/lib/dictionary.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require 'stree'
+require 'category'
+require 'rid_category'
+module NLP
+  # Word -> categories lookup backed by a SearchTree, with a Marshal cache.
+  class Dictionary
+   # Despite the name this is used as the path of the cache *file*.
+   CACHE_DIR = '~/.rima'
+    def initialize
+      @tree = SearchTree.new
+      @categories = {}
+    end
+    # Serialises the dictionary to +state_file+ (tilde-expanded) and returns
+    # self. Marshal output is binary, so the file is opened in binary mode to
+    # keep platforms with newline translation from corrupting it.
+    def store( state_file )
+      File.open( File.expand_path( state_file ), "wb" ) do |file|
+        Marshal.dump( self, file )
+      end
+      self
+    end
+    # Reads back a dictionary written by #store.
+    # NOTE(review): Marshal can instantiate arbitrary objects; only restore
+    # files produced by this program.
+    def self.restore( state_file )
+      File.open( File.expand_path( state_file ), "rb" ) do |file|
+        Marshal.restore( file )
+      end
+    end
+    # Returns whatever the tree holds for +word+, or nil when the word matches
+    # @exception_pattern (which nothing in this class assigns).
+    def find( word )
+      return nil if @exception_pattern && @exception_pattern =~ word
+      @tree.find( word )
+    end
+    # Fills the tree from a tab-indented category file:
+    #   no leading tab  -> new top-level category
+    #   one leading tab -> new category under the current top-level one
+    #   two leading tabs and an all-uppercase line -> new third-level category
+    #   anything else   -> a word filed under the most recent category
+    # Words are lower-cased and a trailing "(1)" marker is removed.
+    # Any error is reported together with the offending line and re-raised.
+    def load_categories( category_file )
+      category = nil
+      primary = nil
+      secondary = nil
+      tertiary = nil
+      File.open( category_file ) do |file|
+        while line = file.gets
+          line.chomp!
+          begin
+            lead, rest = line.scan( /(\t*)(.*)/ ).first
+            if lead.size == 0
+              category = primary = RIDCategory.new( rest )
+              secondary = tertiary = nil
+            elsif lead.size == 1
+              category = secondary = RIDCategory.new( rest, primary )
+              tertiary = nil
+            elsif lead.size == 2 && line.strip =~ /^[A-ZĄŚĘĆŃŹŻŁÓ]+$/
+              category = tertiary = RIDCategory.new( rest, secondary )
+            else
+              word = rest.downcase.gsub( /\s*\(1\)$/, '' )
+              @tree.insert( word, category )
+            end
+          rescue
+            puts "Error for line: #{line}"
+            raise
+          end
+        end
+      end
+    end
+  end
+end

data/lib/emoticon.rb ADDED Viewed

@@ -0,0 +1,13 @@
+module NLP
+class Emoticon < Token
+	include Meaningable
+	def initialize(tokens,tags)
+            @orth = tokens.join("")
+            @tags = 'emoticon'
+	end
+end
+end

data/lib/inflectable.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module Inflectable
+	GRAM_CAT = {
+		#rzeczownik
+		[:subst, :depr] => 'rzeczownik',
+		:adj => 'przymiotnik',
+		:adv => 'przyslowek',
+		:num => 'liczebnik',
+		[:pron,:siebie] => 'zaimek',
+		:prep => 'przyimek',
+		#liczby
+	    	:sg => 'liczba_pojedyncza',
+		:pl => 'liczba_mnoga',
+		#Przypadki
+		:nom => 'mianownik',
+    		:gen => 'dopelniacz',
+    		:dat => 'celownik',
+    		:acc => 'biernik',
+    		:inst => 'narzednik',
+    		:loc => 'miejscownik',
+    		:voc => 'wolacz',
+		#Rodzaje
+    		:m1 => 'męski_osobowy',
+    		:m2 => 'męski_zwierzęcy',
+    		:m3 => 'męski_rzeczowy',
+    		:f => 'żeński',
+    		:n1 => 'nijaki zbiorowy',
+		:n2 => 'nijaki zwykły',
+    		:p1 => 'przymnogi osobowy',
+		:p2 => 'przymnogi zwykły',
+		:p3 => 'przymnogi opisowy',
+		#Osoby
+		:pri => "pierwsza_osoba",
+		:sec => "druga_osoba",
+		:ter => "trzecia_osoba",
+		#Stopień
+		:pos => "stopien_rowny",
+		:comp => "stopien_wyzszy",
+		:sup => "stopien_najwyzszy"
+	}
+	      GRAM_CAT.each do |key,value|
+		if key.kind_of? Array
+			key = key.first
+		else
+			define_method(value+"?"){
+				inflection.split(":").any?{|e| e.include? key.to_s[1..-1]}
+			}
+		end
+	      end
+end

data/lib/liwc_category.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module NLP
+    # Empty Category subclass: it adds no methods or state of its own.
+    class LIWCCategory < Category
+    end
+end

data/lib/meaningable.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# Mixin declaring meaning-related predicates. Every method body is empty, so
+# each of them currently returns nil.
+module Meaningable
+# LIWC
+	def positive_emotion?
+	end
+	def negative_emotion?
+	end
+	def emotion?
+	end
+	def cognitive?
+	end
+	def social?
+	end
+# Experimental
+	def bad_word?
+	end
+	def emoticon?
+	end
+	def filler?
+	end
+	def nonfluent?
+	end
+# Semantic
+	# other - the object to compare with; unused by the empty body.
+	def synonym?(other)
+	end
+	def synonyms
+	end
+end

data/lib/morfeusz.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# Ruby bindings for Morfeusz v. 0.1
+# Author: Aleksander Pohl
+# apohllo@o2.pl
+require 'rubygems'
+require 'inline'
+require 'singleton'
+require 'iconv'
+module NLP
+  # Bindings to the Morfeusz C library, compiled on the fly with RubyInline.
+  module Morfeusz
+    # Option id and value interpolated into the morfeusz_set_option call below.
+    MORFOPT_ENCODING = 1
+    MORFEUSZ_UTF_8 = 8
+    # Singleton wrapper whose C-backed methods are generated by the inline block.
+    class Morfeusz
+      include Singleton
+      inline(:C) do |builder|
+        builder.include '"morfeusz.h"'
+        # NOTE(review): the include directory is an absolute path from one
+        # developer machine; the build presumably fails elsewhere -- confirm.
+        builder.add_compile_flags '-lmorfeusz', '-I/home/knife/morf/include/'
+        # C initialize(): passes the two constants above to morfeusz_set_option.
+        builder.c <<-END
+          void initialize(){
+            morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
+          }
+        END
+        # C about(): returns the string produced by morfeusz_about().
+        builder.c <<-END
+          char * about(){
+            return morfeusz_about();
+          }
+        END
+        # C _base(str): runs morfeusz_analyse on the string and collects the
+        # non-NULL `haslo` field of every result entry, stopping at the entry
+        # whose `k` is -1, into a Ruby Array.
+        builder.c <<-END
+          VALUE _base(VALUE str){
+            char * p;
+            int index = 0;
+            VALUE arr = rb_ary_new();
+            int id_push = rb_intern("push");
+            p = StringValuePtr(str);
+            InterpMorf* result = morfeusz_analyse(p);
+              InterpMorf el;
+              while((el = result[index++]).k != -1){
+                if(el.haslo != NULL){
+                  rb_funcall(arr,id_push,1,rb_str_new2(el.haslo));
+                }
+              }
+            return arr;
+          }
+        END
+        # Returns a fresh Array holding the elements returned by _base(word).
+        def base(word)
+#          _base(word)
+          _base(word).collect{|e| e}
+        end
+      end
+    end
+    # Value object holding one base form returned by Morfeusz#base.
+    class Lexeme
+      attr_reader :base_form
+      def initialize(base_form)
+        @base_form = base_form
+      end
+      # Returns one Lexeme per base form reported for +word+ (possibly none).
+      def self.find(word)
+        Morfeusz.instance.base(word).collect{|bf| Lexeme.new(bf)}
+      end
+    end
+  end
+end

data/lib/nlp.rb ADDED Viewed

@@ -0,0 +1,4 @@
+require 'stdlib/ext/array'
+require 'stdlib/ext/string.rb'
+require 'analyzer'

data/lib/rid_category.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module NLP
+    class RIDCategory < Category
+    def primary?
+      root == :PIERWOTNE
+    end
+    def secondary?
+      root == :WTORNE
+    end
+    def emotions?
+      root == :EMOCJE
+    end
+    end
+end

data/lib/sentence.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module NLP
+class Sentence
+    attr_reader :tokens
+    def initialize()
+        @tokens = []
+    end
+    def << tokens
+        @tokens.concat tokens
+    end
+end
+end

data/lib/stdlib/ext/array.rb ADDED Viewed

@@ -0,0 +1,6 @@
+class Array
+  def tail
+    self[1..-1]
+  end
+end

data/lib/stdlib/ext/string.rb ADDED Viewed

@@ -0,0 +1,19 @@
+class String
+    alias old_memeber []
+    def ord (index)
+        self.old_memeber index
+    end
+    def get(index)
+        self.scan(/./)[index]
+    end
+    def set(index,value)
+        arr = self.scan(/./)
+        arr[index] = value
+        self.replace(arr.join)
+        value
+    end
+end

data/lib/stree.rb ADDED Viewed

@@ -0,0 +1,85 @@
+module NLP
+  class SearchTree
+    ALPHABET = %w{* - a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u w y z ź ż}
+    SYMBOLS = %w{* - : - / ) (}
+    attr_accessor :value
+    attr_accessor :subtrees
+    # 0 -> *
+    # 1 -> -
+    # 2 -> a
+    # 33 -> ź
+    def initialize
+      @subtrees = Array.new( 34, nil )
+      @value = []
+    end
+    def insert( s, value )
+      priv_insert( s.scan(/./), value )
+    end
+    def find( s )
+      priv_find( s.scan(/./) )
+    end
+  protected
+    def key( chr )
+        unless chr
+            raise ArgumentError,  "Argument chr is nil"
+        end
+        rval = ALPHABET.index(chr) || -1
+        if rval > 35
+          rval = -1 # invalid character
+        end
+       rval
+    end
+    def priv_insert( s, value )
+      if s.empty?
+        @value.push value
+      else
+        index = key( s.first )
+        subtree = if @subtrees[index] == nil
+          @subtrees[index] = SearchTree.new
+        else
+          @subtrees[index]
+        end
+        subtree.priv_insert( s.tail, value )
+      end
+    end
+    def priv_find( search )
+      if @subtrees[0]
+        @subtrees[0].value
+      else
+        if search.empty?
+          value
+        else
+          index = key( search.first )
+          if @subtrees[index]
+            @subtrees[index].priv_find( search.tail )
+          else
+            nil
+          end
+        end
+      end
+    end
+public
+   def traverse()
+        list = []
+        yield @value
+        list.concat @subrees if @subtrees  != nil
+        loop do
+            break if list.empty?
+            node = list.shift
+            yield node.value
+            list.concat node.subtrees if node.subtrees != nil
+        end
+end
+end
+end

data/lib/token.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'inflectable'
+module NLP
+class Token
+   attr_reader :orth
+   attr_reader :tags
+    def initialize(orth,tags)
+        @orth = orth
+        @tags = tags
+    end
+    def interp?
+        @tags.eql? "interp"
+    end
+    def word?
+        not interp? and not number?
+    end
+    def number?
+        @tags.include?("tnum")
+    end
+    def integer?
+        @tags.include?("tnum:integer")
+    end
+    def float?
+        @tags.include?("tnum:frac")
+    end
+end
+end

data/lib/token_scanner.rb ADDED Viewed

@@ -0,0 +1,137 @@
+require 'rexml/document'
+require 'soap/rpc/driver'
+module NLP
+class TokenScanner
+include REXML
+  attr_reader :text, :tokens
+    def initialize(text, method)
+        @pos = 0
+        if method === :file
+            puts "laduje tekst"
+            @text = load_lemated_text(text)
+        elsif method === :text
+            @text = lematize_text(text)
+        else
+            @text = text
+        end
+        @tokens = flatten_text(@text)
+    end
+    def next(type)
+        @pos+=1
+        case type
+        when :word
+            while @pos < @tokens.size and !@tokens[@pos].word?
+                @pos+= 1
+            end
+        when :interp
+            while @pos < @tokens.size and !@tokens[@pos].interp?
+                @pos+= 1
+            end
+         when :number
+            while @pos < @tokens.size and !@tokens[@pos].number?
+                @pos+= 1
+            end
+        end
+    end
+    def current
+        if @pos == @tokens.size
+                nil
+        else
+                @tokens[@pos]
+        end
+    end
+    def index
+        @pos
+    end
+    def end?
+        @pos == tokens.size
+    end
+    private
+    def flatten_text(text)
+        flattened = []
+        text.each { |s| s.tokens.each {|t| flattened.push t } }
+        flattened
+    end
+   def load_lemated_text(text_file)
+        	t1 = Thread.new do
+          	`takipi -i #{text_file} -o output.xml -it TXT`
+        	end
+        	t1.join
+       text = []
+       File.open("output.xml") do |f|
+           doc = Document.new(f)
+           doc.elements.each("*/chunkList/chunk") do |chunk|
+                sentence = Sentence.new
+                tokens = []
+                chunk.elements.each("tok") do |tok|
+                   word = tok.elements[1].text
+                   lemat, inflect = ""
+                   tok.elements.each("lex") do |lex|
+                        if lex.has_attributes?
+                            lemat = lex.elements[1].text
+                            inflect = lex.elements[2].text
+                        end
+                   end
+                   tokens << Word.new(word,lemat,inflect)
+               end
+                sentence << tokens
+                text << sentence
+        end
+    end
+    text
+    end
+   def lematize_text(text)
+        temp_text = []
+       text.split(/\.|!|\?/).each do |s|
+            sentence = Sentence.new
+            sentence << s.split(" ").collect{ |t|
+                if word = Morfeusz::Lexeme.find(t)
+                   if word[0]
+                        Word.new(t,word[0].base_form,"")
+                   else
+                        Word.new(t,"","")
+                   end
+                else
+                    Word.new(t,"","")
+                end
+            }
+            temp_text.push  sentence
+       end
+       temp_text
+   end
+end
+end