nlp 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/lib/dictionary.rb
DELETED
@@ -1,85 +0,0 @@
-
-require 'stree'
-require 'category'
-require 'rid_category'
-require 'liwc_category'
-
-module NLP
-
-  class Dictionary
-    attr_accessor :tree
-
-
-    def initialize(category_file=:rid,restore = true)
-      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
-      if restore and File.exist?(state_file)
-        d = Dictionary.restore(state_file)
-        @tree = d.tree
-      else
-        @tree = SearchTree.new
-        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
-        store(state_file)
-      end
-
-    end
-
-    def store( state_file )
-      File.open( File.expand_path( state_file ), "w" ) do |file|
-        Marshal.dump( self, file )
-      end
-      self
-    end
-
-    def self.restore( state_file )
-      File.open( File.expand_path( state_file ) ) do |file|
-        Marshal.restore( file )
-      end
-    end
-
-    def find(word)
-      if @exception_pattern && @exception_pattern =~ word
-        nil
-      else
-        @tree.find(word)
-      end
-    end
-
-
-    def load_categories(category_file,type)
-      category = nil
-      primary = nil
-      secondary = nil
-      tertiary = nil
-
-      if type == :rid
-        cat_class = NLP.const_get("RIDCategory")
-      else
-        cat_class = NLP.const_get("LIWCCategory")
-      end
-
-      File.open( category_file ) do |file|
-        while line = file.gets
-          line.chomp!
-          begin
-            lead, rest = line.scan( /(\t*)(.*)/ ).first
-            if lead.size == 0
-              category = primary = cat_class.new(rest)
-              secondary, tertiary = nil
-            elsif lead.size == 1
-              category = secondary = cat_class.new(rest, primary )
-              tertiary = nil
-            elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-              category = tertiary = cat_class.new( rest, secondary )
-            else
-              word = rest.downcase.gsub( /\s*\(1\)$/, '' )
-              @tree.insert( word, category )
-            end
-          rescue
-            raise
-          end
-        end
-      end
-    end
-  end
-end
-
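For orientation, a minimal usage sketch of the removed NLP::Dictionary, inferred only from the deleted code above and its caller in liwc_analyzer.rb; the lookup word is made up, the cache path constant comes from elsewhere in the 0.2.6 gem, and none of this exists in 0.2.7.

# Illustrative only: exercising the 0.2.6 NLP::Dictionary that this release removes.
dict = NLP::Dictionary.new(:liwc)      # restores the marshalled cache or rebuilds it from data/dict/liwc
categories = dict.find('dom')          # 'dom' is a hypothetical lookup; returns the word's categories or nil
categories.each { |c| puts c.root } if categories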
data/lib/emoticon.rb
DELETED
data/lib/inflectable.rb
DELETED
@@ -1,60 +0,0 @@
-module Inflectable
-
-  GRAM_CAT = {
-    # rzeczownik (noun / parts of speech)
-    :adj => 'przymiotnik',
-    [:subst,:depr] => 'rzeczownik',
-    :adv => 'przyslowek',
-    :num => 'liczebnik',
-    [:pron,:siebie] => 'zaimek',
-    :prep => 'przyimek',
-    # liczby (grammatical number)
-    :sg => 'liczba_pojedyncza',
-    :pl => 'liczba_mnoga',
-
-    # Przypadki (cases)
-    :nom => 'mianownik',
-    :gen => 'dopelniacz',
-    :dat => 'celownik',
-    :acc => 'biernik',
-    :inst => 'narzednik',
-    :loc => 'miejscownik',
-    :voc => 'wolacz',
-
-    # Rodzaje (genders)
-    :m1 => 'meski_osobowy',
-    :m2 => 'meski_zwierzecy',
-    :m3 => 'meski_rzeczowy',
-    :f => 'zenski',
-    :n1 => 'nijaki_zbiorowy',
-    :n2 => 'nijaki zwykly',
-    :p1 => 'przymnogi_osobowy',
-    :p2 => 'przymnogi_zwykly',
-    :p3 => 'przymnogi_opisowy',
-
-    # Osoby (persons)
-    :pri => "pierwsza_osoba",
-    :sec => "druga_osoba",
-    :ter => "trzecia_osoba",
-
-    # Stopień (degree)
-    :pos => "stopien_rowny",
-    :comp => "stopien_wyzszy",
-    :sup => "stopien_najwyzszy"
-  }
-
-  GRAM_CAT.each do |key,value|
-
-    define_method(value+"?"){
-      inflection.split(":").any?{|e|
-        if key.is_a? Array
-          key.any?{|k| e.include? k.to_s}
-        else
-          e.include? key.to_s
-        end
-      }
-    }
-  end
-
-
-end
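A brief illustration of how the removed Inflectable mixin generated its predicate methods from GRAM_CAT; the SampleToken class and the "subst:sg:nom:m1" tag string are hypothetical stand-ins, shown here only to make the metaprogramming above concrete.

# Illustration of the removed Inflectable mixin; SampleToken and the tag string are invented.
class SampleToken
  include Inflectable
  attr_reader :inflection
  def initialize(tag)
    @inflection = tag
  end
end

t = SampleToken.new("subst:sg:nom:m1")
t.rzeczownik?   # => true,  :subst maps to 'rzeczownik'
t.mianownik?    # => true,  :nom maps to 'mianownik'
t.celownik?     # => false, no :dat segment in the tag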
data/lib/lemmatizer.rb
DELETED
@@ -1,112 +0,0 @@
-require 'takipi_web_service'
-require 'rexml/document'
-require 'morfeusz'
-
-module NLP
-  class Lemmatizer
-
-    include REXML
-
-    def self.lemmatize(text,method=nil,input_type=nil)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lemmatize(str,input_type)
-
-      # Default lemmatization method is Morfeusz
-      else
-        morfeusz_lemmatize(str)
-      end
-    end
-
-
-
-    def self.takipi_lemmatize(text,method)
-
-      if method === :local
-
-        xml_file = TAKIPI_XML_FILE
-
-        t1 = Thread.new do
-          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
-        end
-
-        t1.join
-
-        f = File.open(xml_file,"r")
-        doc = Document.new f
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
-      end
-
-      parse_lemmatized_xml(doc)
-    end
-
-
-    def self.morfeusz_lemmatize(text)
-      temp_text = Text.new
-
-      # simple tagger
-      # TODO: the lemmatizer should take a TokenScanner object that defines
-      # how to split the string
-      text.split(/\.|!|\?/).each do |s|
-        sentence = Sentence.new
-        sentence << s.split(" ").collect{ |t|
-          if word = Morfeusz::Lexeme.find(t)
-            if word[0]
-              Word.new(t,word[0].base_form,"")
-            else
-              Word.new(t,"","")
-            end
-          else
-            Word.new(t,"","")
-          end
-        }
-        temp_text << sentence
-      end
-      temp_text
-    end
-
-
-    def self.parse_lemmatized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
-        end
-
-        sentence << tokens
-        text << sentence
-      end
-      text
-    end
-
-
-  end
-end
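Likewise, a hedged sketch of how the removed NLP::Lemmatizer entry point was invoked, inferred from the deleted lemmatize signature above; the sample sentence is illustrative and this class is gone in 0.2.7.

# Illustrative only: the two code paths exposed by the removed NLP::Lemmatizer.
morf = NLP::Lemmatizer.lemmatize("Ala ma kota.")                    # default path: local Morfeusz lexicon
taki = NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :remote)  # TaKIPI, via its web service
# Both return an NLP::Text: Sentence objects holding Word tokens (surface form, lemma, inflection tag).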
data/lib/liwc_analyzer.rb
DELETED
@@ -1,74 +0,0 @@
-module NLP
-  class LIWCAnalyzer < Analyzer
-
-    def initialize(dicts)
-      @dictionary = Dictionary.new(:liwc)
-    end
-
-
-    def analyze(scanner)
-
-      results = Statistic.new
-      results.hash = {
-        :long_words => [],
-        :zaimki => [],
-        :zaimki1 => [],
-        :zaimki2 => [],
-        :zaimki3 => [],
-        :przyimki => [],
-        :numbers => [],
-        :emotion => [],
-        :social => [],
-        :personal => [],
-        :posemotion => [],
-        :negemotion => [],
-        :wulgar => [],
-        :cognitive => []
-      }
-
-      while token = scanner.current
-        word = token.lemat
-
-        categories = @dictionary.find(word.gsub( /[^\w-]/, "" ))
-        unless categories.nil?
-          categories.each do |category|
-            puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
-            token.category = category
-            results.add(word,category)
-
-
-            results[:emotion].push token.orth if token.emotion?
-            results[:social].push token.orth if token.social?
-            results[:personal].push token.orth if token.personal?
-            results[:wulgar].push token.orth if token.bad_word?
-            results[:cognitive].push token.orth if token.cognitive?
-
-            results[:posemotion].push token.orth if token.positive_emotion?
-            results[:negemotion].push token.orth if token.negative_emotion?
-          end
-        end
-
-        #words longer than 10
-        results[:long_words].push word if word.jlength > 10
-        if token.zaimek?
-          results[:zaimki].push word
-
-          results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
-          results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
-          results[:zaimki3].push token.orth if word === 'on'
-        end
-
-        results[:przyimki].push word if token.przyimek?
-        results[:numbers].push token.orth if token.number? or token.liczebnik?
-
-
-        results.total_words += 1
-        scanner.next(:alphanum)
-      end
-      results
-
-    end
-
-  end
-
-end
data/lib/liwc_category.rb
DELETED
@@ -1,61 +0,0 @@
-module NLP
-
-  class LIWCCategory < Category
-
-    #primary categories
-
-    def linguistic?
-      root == :PIERWOTNE
-    end
-
-    def psychological?
-      root == :PROCESY_PSYCHOLOGICZNE
-    end
-
-
-    def relative?
-      root === :RELATYWNOSC
-    end
-
-    def personal?
-      root == :OSOBISTE
-    end
-
-    #second categories
-
-    def emotion?
-      path.include? 'EMOCJE'
-
-    end
-
-    def positive_emotion?
-      path.include? 'POZYTYWNE_EMOCJE'
-
-    end
-
-    def negative_emotion?
-      path.include? 'NEGATYWNE_EMOCJE'
-
-    end
-
-    def cognitive?
-      path.include? 'KOGNITYWNE_PROCESY'
-
-    end
-
-    def sense?
-      path.include? 'ZMYSLY'
-    end
-
-    def social?
-      path.include? 'SOCIAL'
-
-    end
-
-    def bad_word?
-      path.include? 'WULGAR'
-    end
-
-
-  end
-end
data/lib/meaningable.rb
DELETED
@@ -1,69 +0,0 @@
-module Meaningable
-
-  #LIWC
-  #primary categories
-
-  def linguistic?
-    category.root == :PIERWOTNE
-  end
-
-  def psychological?
-    category.root == :PROCESY_PSYCHOLOGICZNE
-  end
-
-
-  def relative?
-    category.root === :RELATYWNOSC
-  end
-
-  def personal?
-    category.root == :OSOBISTE
-  end
-
-  #second categories
-
-  def emotion?
-    category.path.include? 'EMOCJE'
-
-  end
-
-  def positive_emotion?
-    category.path.include? 'POZYTYWNE_EMOCJE'
-
-  end
-
-  def negative_emotion?
-    category.path.include? 'NEGATYWNE_EMOCJE'
-
-  end
-
-  def cognitive?
-    category.path.include? 'KOGNITYWNE_PROCESY'
-
-  end
-
-  def sense?
-    category.path.include? 'ZMYSLY'
-  end
-
-  def social?
-    category.path.include? 'SOCIAL'
-
-  end
-
-  def bad_word?
-    category.path.include? 'WULGAR'
-  end
-
-
-  #SEMANTIC
-  def synonym?(other)
-
-  end
-
-  def synonyms
-
-  end
-
-
-end