RubyGems - rblearn - Versions diffs - 0.2.2 → 0.3.0 - Mend

rblearn 0.2.2 → 0.3.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml +4 -4
data/lib/rblearn/CountVectorizer.rb +84 -89
data/lib/rblearn/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: baeb51c361fba549f4909d2d9dc082b56b07bc54
-  data.tar.gz: 17855594b1a780753f5a6e708be8603aaa3497db
+  metadata.gz: 0fb5e3f456a1e221345f6217eb95da98592eaf81
+  data.tar.gz: d14a5e3ef23821b381e77545c2ed71c90da82e0d
 SHA512:
-  metadata.gz: beb24d4020d67d1b9e7e6455d682433adee10848dfdb5e01a940a22d470778120713e05d2d0780173ec4d87aa0399895c97509f145768a2bbe13793e5bddddf0
-  data.tar.gz: 060d38b1a4fd38c05f4937b447f037bab7f662ea13f9849443a5c479601f8781b149cc450f54602225aae0961e11dba0cd77432f10b4e12feb80cb3b32df654c
+  metadata.gz: 5b86f3d52a267b07277419d1dffc6f7ab461a8e524cf44af9de2c96e08b6e75fdb7a9ffac99f0f63534bfbb3994b325a426732cbad4f2740f582008e74996984
+  data.tar.gz: a393401ceac0d65543b3b1dfee6975bf82b0f695525f10702ed66abb4a64c481a250c24932effad3460b413eb25483d8eb88f91831fcca3a08397c03ef1525d1

data/lib/rblearn/CountVectorizer.rb CHANGED Viewed

@@ -1,100 +1,95 @@
 module Rblearn
-	class CountVectorizer
+  class CountVectorizer
     # TODO: consider the access controll about all variables
-		attr_accessor :feature_names, :doc_matrix, :token2index
-		# tokenizer: lambda function :: string -> Array<string>
-		# lowcase: whether if words are lowercases :: bool
-		# stop_words: list of stop words :: Array<string>
-		# max_features: limitation of feature size :: Float \in [0, 1]
-		# TODO: by max_features, zero vectors are sometimes created.
-		def initialize(tokenizer, lowercase=true, max_features=0.8)
-			@tokenizer = tokenizer
-			@lowercase = lowercase
-			stop_words = Stopwords::STOP_WORDS
-			stop_words.map! {|token| token.stem}
-			stop_words.map! {|token| token.downcase} if @lowercase
-			@stopwords = stop_words
-			@max_feature = max_features
-		end
-		# features: Each documents' feature :: Array<String> -> NArray::Int64
-		def fit_transform(features)
-			all_vocaburaries = []
-			word_frequency = Hash.new{|hash, key| hash[key] = 0}
-      document_frequency = Hash.new{|hash, key| hash[key] = 0}
-      word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
-      document_size = features.size
-			features.each do |feature|
-        token_list = @tokenizer.call(feature)
-        # compute tf-value
-				token_list.each do |token|
-					token.downcase! if @lowercase
-					word_frequency[token] += 1
-				end
-        # compute df-value
+    attr_accessor :token2index
+    # tokenizer: lambda function :: string -> Array<string>
+    # lowcase: whether if words are lowercases :: bool
+    # stop_words: list of stop words :: Array<string>
+    # max_features: limitation of feature size :: Float \in [0, 1]
+    # TODO: by max_features, zero vectors are sometimes created.
+    def initialize(tokenizer, lowercase=true, max_features=0.5)
+      @tokenizer = tokenizer
+      @lowercase = lowercase
+      stop_words = Stopwords::STOP_WORDS + ['-', '--', '(', ')', "\\", "'", '"', '!', '?', ':', ';', '.', ',', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
+      stop_words.map! {|token| token.stem}
+      stop_words.map! {|token| token.downcase} if @lowercase
+      @stopwords = stop_words
+      @max_feature = max_features
+    end
+    def get_feature_names
+      @feature_names
+    end
+    # features: Each documents' feature :: Array<String> -> NArray::Int64
+    def fit_transform(features)
+      all_vocabularies = []
+      tf = Hash.new{|hash, token| hash[token] = 0}
+      df = Hash.new{|hash, token| hash[token] = 0}
+      tfidf = Hash.new{|hash, token| hash[token] = 0}
+      # features: Array<string>
+      features.each do |feature|
+        feature.downcase! if @lowercase
+        token_list = @tokenizer.call(feature).reject{|token| @stopwords.include?(token)}
+        all_vocabularies += token_list
+        token_list.each do |token|
+          tf[token] += 1
+        end
         token_list.uniq.each do |token|
-          document_frequency[token] += 1
-					all_vocaburaries << token
+          df[token] += 1
         end
-			end
+      end
+      # to get the set of vocabulary
+      all_vocabularies.uniq!
+      tf.sort{|(_, v1), (_, v2)| v2 <=> v1}.first(20).each do |token, count|
+        tf[token] = 0
+      end
+      all_vocabularies.each do |token|
+        tfval = Math.log(tf[token])
+        idfval = Math.log(all_vocabularies.size.to_f / df[token]) + 1
+        tfidf[token] = tfval * idfval
+      end
+      tfidf = tfidf.sort{|(_, v1), (_, v2)| v2 <=> v1}
-			all_vocaburaries.uniq!
-      all_vocaburaries.each do |token|
-        tf = 1 + Math.log(word_frequency[token])
-        idf = Math.log(1+(document_size/document_frequency[token]))
-        word_tfidf_score[token] = tf * idf
+      feature_names = (0...(tfidf.size * @max_feature).to_i).map{|i| tfidf[i][0]}
+      token2index = {}
+      feature_names.each_with_index do |token, i|
+        token2index[token] = i
       end
-      word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
-			feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
-			token2index = {}
-			feature_names.each_with_index do |token, i|
-				token2index[token] = i
-			end
-			doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
-			features.each_with_index do |feature, doc_id|
-				tokens = []
-				@tokenizer.call(feature).each do |token|
-					token.downcase! if @lowercase
-					tokens << token unless @stopwords.include?(token)
-				end
-				# BoW representation
-				counter = Hash.new{|hash, key| hash[key] = 0}
-				tokens.each do |token|
-					counter[token] += 1
-				end
-				counter.each do |token, freq|
-					doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
-				end
-			end
-			@doc_matrix = doc_matrix
-			@feature_names = feature_names
-			@token2index = token2index
-			return @doc_matrix
-		end
-	end
-	if __FILE__ == $0
-		cv = CountVectorizer.new(lambda{|s| s.split.map{|token| token.stem}}, 1, 0.8)
-		features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
-		p cv.fit_transform(features)
-		p cv.feature_names
-		p cv.token2index
-	end
+      doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
+      features.each_with_index do |feature, doc_id|
+        tokens = []
+        @tokenizer.call(feature).each do |token|
+          token.downcase! if @lowercase
+          tokens << token unless @stopwords.include?(token)
+        end
+        # BoW representation
+        counter = Hash.new{|hash, key| hash[key] = 0}
+        tokens.each do |token|
+          counter[token] += 1
+        end
+        counter.each do |token, freq|
+          doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
+        end
+      end
+      @feature_names = feature_names
+      @token2index = token2index
+      return doc_matrix
+    end
+  end
 end

data/lib/rblearn/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Rblearn
-  VERSION = "0.2.2"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rblearn
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: ruby
 authors:
 - himkt
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-08-01 00:00:00.000000000 Z
+date: 2016-08-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler