RubyGems - nlp - Versions diffs - 0.2.3 → 0.2.5 - Mend

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/dict/rid CHANGED Viewed

@@ -234,8 +234,7 @@ PIERWOTNE
 			defakacja
 			dupa
 			dupek
-                        dupsko
-                        enema
+			dupsko
 			fetor
 			gazy
 			gnić
@@ -1018,7 +1017,7 @@ PIERWOTNE
 			potrząsać
 			przejażdżka
 			przenosić się
-		PRZYPADKOWE RUCHY
+		PRZYPADKOWE_RUCHY
 			puchnąć
 			pulsować
 			pustynia

data/lib/analyzer.rb CHANGED Viewed

@@ -2,25 +2,28 @@ require 'dictionary'
 #require 'morfeusz'
 require 'token'
 require 'word'
+require 'token'
+require 'text'
 require 'emoticon'
 require 'sentence'
 require "token_scanner.rb"
 require "lemmatizer"
+require 'jcode'
 $KODE = "UTF8"
 module NLP
   class Analyzer
+   CACHE_DIR = '~/'
     def initialize( category_file, restore = true )
-        state_file = File.expand_path(Dictionary::CACHE_DIR)
+        state_file = File.expand_path(Analyzer::CACHE_DIR)
         if restore
            @dictionary = Dictionary.restore(state_file)
         else
             @dictionary = Dictionary.new
-            @dictionary.load_categories( category_file )
+            @dictionary.load_categories( category_file, :rid => true )
             @dictionary.store(state_file)
         end
@@ -33,7 +36,7 @@ module NLP
         :word_count => 0,
         :word_total => 0,
         :scores => Hash.new { 0 },
-        :words => []
+        :words => []
       }

data/lib/dictionary.rb CHANGED Viewed

@@ -2,10 +2,10 @@
 require 'stree'
 require 'category'
 require 'rid_category'
+require 'liwc_category'
 module NLP
   class Dictionary
-   CACHE_DIR = '~/.rima'
     def initialize
       @tree = SearchTree.new
       @categories = {}
@@ -35,7 +35,7 @@ module NLP
     end
-    def load_categories( category_file )
+    def load_categories( category_file,options )
       category = nil
       primary = nil
       secondary = nil
@@ -47,13 +47,27 @@ module NLP
           begin
             lead, rest = line.scan( /(\t*)(.*)/ ).first
             if lead.size == 0
+		    if options[:rid]
               category = primary = RIDCategory.new( rest )
+		    else
+              category = primary = LIWCCategory.new( rest )
+		    end
               secondary, tertiary = nil
             elsif lead.size == 1
+		    if options[:rid]
               category = secondary = RIDCategory.new( rest, primary )
+		    else
+			    category = secondary = LIWCCategory.new(rest,primary)
+		    end
               tertiary = nil
-            elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ]+$/)) && cat >= 0
+            elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
+		    if options[:rid]
               category = tertiary = RIDCategory.new( rest, secondary )
+		    else
+              category = tertiary = LIWCCategory.new( rest, secondary )
+		    end
             else
               word = rest.downcase.gsub( /\s*\(1\)$/, '' )
               @tree.insert( word, category )

data/lib/lemmatizer.rb CHANGED Viewed

@@ -77,7 +77,7 @@ class Lemmatizer
     def self.parse_lematized_xml(doc)
-        text = []
+        text = Text.new
         doc.elements.each("*/chunkList/chunk") do |chunk|
             sentence = Sentence.new

data/lib/liwc_analyzer.rb CHANGED Viewed

@@ -1,8 +1,94 @@
 module NLP
-class LIWC_Analyzer < Analyzer
+class LIWCAnalyzer < Analyzer
+    def initialize( category_file, restore = true )
+        state_file = File.expand_path(Analyzer::CACHE_DIR+'.liwc')
+        if restore
+           @dictionary = Dictionary.restore(state_file)
+        else
+            @dictionary = Dictionary.new
+            @dictionary.load_categories( category_file, :rid => false )
+            @dictionary.store(state_file)
+        end
+    end
-        def analyze
+        def analyze(scanner)
+	    results = {
+                :word_count => 0,
+                :word_total => 0,
+                :scores => Hash.new { 0 },
+                :words => [],
+                :cwords => Hash.new { nil },
+                :long_words => [],
+                :zaimki => [],
+		:zaimki1 => [],
+		:zaimki2 => [],
+		:zaimki3 => [],
+                :przyimki => [],
+                :numbers => [],
+                :emotion => [],
+                :social => [],
+                :personal => [],
+                :posemotion => [],
+                :negemotion => [],
+                :wulgar => [],
+                :cognitive => []
+              }
+             while token = scanner.current
+                word = token.lemat
+                categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
+                unless categories.nil?
+                    categories.each do |category|
+                       puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
+                       token.category = category
+                        results[:scores][category] = results[:scores][category] + 1
+                        if results[:cwords][category.name].nil?
+                            results[:cwords][category.name] = []
+                        end
+                        results[:cwords][category.name].push token.orth
+                        results[:emotion].push token.orth if token.emotion?
+                        results[:social].push token.orth if token.social?
+                        results[:personal].push token.orth if token.personal?
+                        results[:wulgar].push token.orth if token.bad_word?
+                        results[:cognitive].push token.orth if token.cognitive?
+                        results[:posemotion].push token.orth if token.positive_emotion?
+                        results[:negemotion].push token.orth if token.negative_emotion?
+                        results[:word_count] += 1
+                        results[:words].push word
+                    end
+                end
+                #words longer than 9
+                results[:long_words].push word if word.jlength > 9
+		if token.zaimek?
+                	results[:zaimki].push word
+			results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
+			results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
+			results[:zaimki3].push token.orth if word === 'on'
+		end
+                results[:przyimki].push word if token.przyimek?
+                results[:numbers].push token.orth if token.number? or token.liczebnik?
+                results[:word_total] += 1
+                scanner.next(:alphanum)
+             end
+             results
 	end

data/lib/liwc_category.rb CHANGED Viewed

@@ -1,7 +1,62 @@
 module NLP
     class LIWCCategory < Category
+	#primary categories
+       def linguistic?
+          root == :PIERWOTNE
+        end
+        def psychological?
+          root == :PROCESY_PSYCHOLOGICZNE
+        end
+        def relative?
+            root === :RELATYWNOSC
+        end
+        def personal?
+          root == :OSOBISTE
+        end
+        #second categories
+        def emotion?
+            path.include? 'EMOCJE'
+        end
+        def positive_emotion?
+             path.include? 'POZYTYWNE_EMOCJE'
+        end
+        def negative_emotion?
+            path.include? 'NEGATYWNE_EMOCJE'
+        end
+        def cognitive?
+            path.include? 'KOGNITYWNE_PROCESY'
+        end
+        def sense?
+            path.include? 'ZMYSLY'
+        end
+        def social?
+            path.include? 'SOCIAL'
+        end
+        def bad_word?
+            path.include? 'WULGAR'
+        end
     end
 end

data/lib/meaningable.rb CHANGED Viewed

@@ -1,44 +1,65 @@
 module Meaningable
 #LIWC
-	def positive_emotion?
+    #primary categories
+       def linguistic?
+          category.root == :PIERWOTNE
+        end
+        def psychological?
+          category.root == :PROCESY_PSYCHOLOGICZNE
+        end
-	end
+        def relative?
+            category.root === :RELATYWNOSC
+        end
+        def personal?
+          category.root == :OSOBISTE
+        end
-	def negative_emotion?
-	end
+        #second categories
+        def emotion?
+            category.path.include? 'EMOCJE'
-	def emotion?
+        end
-	end
+        def positive_emotion?
+             category.path.include? 'POZYTYWNE_EMOCJE'
+        end
-	def cognitive?
-	end
+        def negative_emotion?
+            category.path.include? 'NEGATYWNE_EMOCJE'
-	def social?
-	end
-#EXPERIMENTAl
+        end
-	def bad_word?
+        def cognitive?
+            category.path.include? 'KOGNITYWNE_PROCESY'
-	end
+        end
+        def sense?
+            category.path.include? 'ZMYSLY'
+        end
-	def emoticon?
+        def social?
+            category.path.include? 'SOCIAL'
-	end
+        end
+        def bad_word?
+            category.path.include? 'WULGAR'
+        end
-	def filler?
-	end
-	def nonfluent?
-	end
 #SEMANTIC
 	def synonym?(other)

data/lib/rid_analyzer.rb CHANGED Viewed

@@ -1,12 +1,27 @@
 module NLP
     class  RIDAnalyzer < NLP::Analyzer
+	def initialize( category_file, restore = true )
+        state_file = File.expand_path(Analyzer::CACHE_DIR+'.rid')
+        if restore
+           @dictionary = Dictionary.restore(state_file)
+        else
+            @dictionary = Dictionary.new
+            @dictionary.load_categories( category_file, :rid => true )
+            @dictionary.store(state_file)
+        end
+    end
         def analyze(scanner)
              results = {
                 :word_count => 0,
                 :word_total => 0,
                 :scores => Hash.new { 0 },
-                :words => []
+                :words => [],
+                :cwords => Hash.new { nil }
               }
              while token = scanner.current
@@ -15,8 +30,13 @@ module NLP
                 categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
                 unless categories.nil?
                     categories.each do |category|
-                       puts "Znalazłem słowo #{word} : #{category}"
+                       puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
                         results[:scores][category] = results[:scores][category] + 1
+                        category = category.name
+                        if results[:cwords][category].nil?
+                            results[:cwords][category] = []
+                        end
+                        results[:cwords][category].push word
                         results[:word_count] += 1
                         results[:words].push word
                     end
@@ -46,6 +66,8 @@ module NLP
     end

data/lib/sentence.rb CHANGED Viewed

@@ -8,5 +8,9 @@ module NLP
         def << tokens
             @tokens.concat tokens
         end
+        def words_number
+            @tokens.size
+        end
     end
 end

data/lib/text.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module NLP
+    class Text
+        attr_reader :sentences
+        def initialize
+            @sentences = []
+        end
+        def << sentence
+            @sentences.push sentence
+        end
+        def words_per_sentence
+            mean(@sentences.collect{|s| s.words_number})
+        end
+        private
+        def mean(x)
+            sum=0
+            x.each{|v| sum+=v }
+            sum/x.size
+        end
+    end
+end

nlp 0.2.3 → 0.2.5