RubyGems - rblearn - Versions diffs - 0.2.0 → 0.2.1 - Mend

rblearn 0.2.0 → 0.2.1

Files changed (5) hide show

checksums.yaml +4 -4
data/README.md +64 -3
data/lib/rblearn/CountVectorizer.rb +23 -3
data/lib/rblearn/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
-  data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
+  metadata.gz: 5aa0e8ff81bd927e89f9b061eb3fbe95b7d90c2e
+  data.tar.gz: c25921d85f54ccd9dad0b53941414db14ac83403
 SHA512:
-  metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
-  data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
+  metadata.gz: 60bb7c0a1d7e08eadd65117b705cdd449e3f142d6be238d97384e9fefdd2b731c9e2e3860ccf86b9421eb9a0b69200e56281581c8d3a58f44661eefb14634f9b
+  data.tar.gz: dd9de7761f4fb7792843948b89b1933756213b8a632a1e6d41bc5a65221ed0a551034d2510210353f4b77c5cb9bfb6eb596526998a7ff478c31eded9d7e0986f

data/README.md CHANGED Viewed

@@ -1,8 +1,15 @@
 # Rblearn
-Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/rblearn`. To experiment with that code, run `bin/console` for an interactive prompt.
+[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)
+[![Website](https://img.shields.io/website-up-down-green-red/http/shields.io.svg?maxAge=2592000)](https://rubygems.org/gems/rblearn)
+[![GitHub issues](https://img.shields.io/github/issues/himkt/rblearn.svg)](https://github.com/himkt/rblearn/issues)
+[![GitHub stars](https://img.shields.io/github/stars/himkt/rblearn.svg)](https://github.com/himkt/rblearn/stargazers)
+[![GitHub release](https://img.shields.io/github/release/qubyte/rubidium.svg?maxAge=2592000)](https://github.com/himkt/rblearn)
+[![GitHub commits](https://img.shields.io/github/commits-since/SubtitleEdit/subtitleedit/3.4.7.svg?maxAge=2592000)](https://github.com/himkt/rblearn)
-TODO: Delete this and the text above, and describe your gem
+ruby-learn is a library for machine learning.
+It currently supports cross-validation and feature extraction.
 ## Installation
@@ -22,7 +29,61 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+### Cross Validation
+CrossValidation provides two features: train_test_split and k-fold cross-validation.
+1. train_test_split
+  This method splits your dataset into train\_set and test\_set.
+  ```ruby
+  x_train, y_train, x_test, y_test = Rblearn::CrossValidation.train_test_split(x, y, 0.7).map(&:dup)
+  ```
+2. K-Fold
+  This method is for k-fold cross-validation.
+  Three parameters are required.
+  1. n :: integer
+    n is the size of the dataset.
+  2. n_folds :: integer
+    n_folds specifies k, the number of folds.
+  3. shuffle :: boolean
+    If shuffle is true, the dataset is shuffled.
+  ```ruby
+  kf = Rblearn::CrossValidation::KFold.new(100, 10, true)
+  kf.create #=> list<list<train_set_indices, test_set_indices>>
+  ```
+### Count Vectorizer
+CountVectorizer is a feature extractor for text.
+The constructor takes three parameters.
+1. tokenizer :: function
+2. lowercase :: boolean
+3. max_features :: float (fraction of the vocabulary to keep, e.g. 0.7)
+For example:
+```ruby
+cv = Rblearn::CountVectorizer.new(lambda{|feature| feature.split.map(&:stem)}, 1, 0.7)
+cv.fit_transform(features)
+```
 ## Development

data/lib/rblearn/CountVectorizer.rb CHANGED Viewed

@@ -26,18 +26,38 @@ module Rblearn
 		def fit_transform(features)
 			all_vocaburaries = []
 			word_frequency = Hash.new{|hash, key| hash[key] = 0}
+      document_frequency = Hash.new{|hash, key| hash[key] = 0}
+      word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
+      document_size = features.size
 			features.each do |feature|
-				@tokenizer.call(feature).each do |token|
+        token_list = @tokenizer.call(feature)
+        # compute tf-value
+				token_list.each do |token|
 					token.downcase! if @lowercase
-					all_vocaburaries << token
 					word_frequency[token] += 1
 				end
+        # compute df-value
+        token_list.uniq.each do |token|
+          document_frequency[token] += 1
+					all_vocaburaries << token
+        end
 			end
 			all_vocaburaries.uniq!
 			word_frequency =  word_frequency.sort{|(_, value1), (_, value2)| value2 <=> value1}
-			feature_names = (0...(word_frequency.size * @max_feature).to_i).map{|i| word_frequency[i][0]}
+      all_vocaburaries.each do |token|
+        tf = 1 + Math.log(word_frequency[token])
+        idf = Math.log(1+(document_size/document_frequency[token]))
+        word_tfidf_score[token] = tf * idf
+      end
+      word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
+			feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
 			token2index = {}
 			feature_names.each_with_index do |token, i|

data/lib/rblearn/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Rblearn
-  VERSION = "0.2.0"
+  VERSION = "0.2.1"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rblearn
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - himkt
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-07-29 00:00:00.000000000 Z
+date: 2016-08-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler