rblearn 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +64 -3
- data/lib/rblearn/CountVectorizer.rb +23 -3
- data/lib/rblearn/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5aa0e8ff81bd927e89f9b061eb3fbe95b7d90c2e
|
4
|
+
data.tar.gz: c25921d85f54ccd9dad0b53941414db14ac83403
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60bb7c0a1d7e08eadd65117b705cdd449e3f142d6be238d97384e9fefdd2b731c9e2e3860ccf86b9421eb9a0b69200e56281581c8d3a58f44661eefb14634f9b
|
7
|
+
data.tar.gz: dd9de7761f4fb7792843948b89b1933756213b8a632a1e6d41bc5a65221ed0a551034d2510210353f4b77c5cb9bfb6eb596526998a7ff478c31eded9d7e0986f
|
data/README.md
CHANGED
@@ -1,8 +1,15 @@
|
|
1
1
|
# Rblearn
|
2
2
|
|
3
|
-
|
3
|
+
[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)
|
4
|
+
[![Website](https://img.shields.io/website-up-down-green-red/http/shields.io.svg?maxAge=2592000)](https://rubygems.org/gems/rblearn)
|
5
|
+
[![GitHub issues](https://img.shields.io/github/issues/himkt/rblearn.svg)](https://github.com/himkt/rblearn/issues)
|
6
|
+
[![GitHub stars](https://img.shields.io/github/stars/himkt/rblearn.svg)](https://github.com/himkt/rblearn/stargazers)
|
7
|
+
[![GitHub release](https://img.shields.io/github/release/qubyte/rubidium.svg?maxAge=2592000)](https://github.com/himkt/rblearn)
|
8
|
+
[![GitHub commits](https://img.shields.io/github/commits-since/SubtitleEdit/subtitleedit/3.4.7.svg?maxAge=2592000)](https://github.com/himkt/rblearn)
|
4
9
|
|
5
|
-
|
10
|
+
ruby-learn is a library for machine learning.
|
11
|
+
|
12
|
+
Now, we support cross-validation and feature extraction.
|
6
13
|
|
7
14
|
## Installation
|
8
15
|
|
@@ -22,7 +29,61 @@ Or install it yourself as:
|
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
25
|
-
|
32
|
+
### Cross Validation
|
33
|
+
|
34
|
+
CrossValidation provides two features: train_test_split and k-fold cross-validation.
|
35
|
+
|
36
|
+
1. train_test_split
|
37
|
+
|
38
|
+
This method splits your dataset into train\_set and test\_set.
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
x_train, y_train, x_test, y_test = Rblearn::CrossValidation.train_test_split(x, y, 0.7).map(&:dup)
|
42
|
+
```
|
43
|
+
|
44
|
+
2. K-Fold
|
45
|
+
|
46
|
+
This method is for k-fold cross-validation.
|
47
|
+
|
48
|
+
Three parameters are required.
|
49
|
+
|
50
|
+
1. n :: integer
|
51
|
+
|
52
|
+
n indicates the size of the dataset.
|
53
|
+
|
54
|
+
2. n_folds :: integer
|
55
|
+
|
56
|
+
n_folds specifies k, the number of folds.
|
57
|
+
|
58
|
+
3. shuffle :: boolean
|
59
|
+
|
60
|
+
If shuffle is true, the dataset is shuffled.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
kf = Rblearn::CrossValidation::KFold.new(100, 10, true)
|
64
|
+
kf.create #=> list<list<train_set_indices, test_set_indices>>
|
65
|
+
```
|
66
|
+
|
67
|
+
### Count Vectorizer
|
68
|
+
|
69
|
+
CountVectorizer is a feature extractor for text.
|
70
|
+
|
71
|
+
Constructor needs three parameters.
|
72
|
+
|
73
|
+
1. tokenizer :: function
|
74
|
+
|
75
|
+
2. lowercase :: boolean
|
76
|
+
|
77
|
+
3. max_features :: integer
|
78
|
+
|
79
|
+
|
80
|
+
For example,
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
cv = Rblearn::CountVectorizer.new(lambda{|feature| feature.split.map(&:stem)}, 1, 0.7)
|
84
|
+
cv.fit_transform(features)
|
85
|
+
```
|
86
|
+
|
26
87
|
|
27
88
|
## Development
|
28
89
|
|
@@ -26,18 +26,38 @@ module Rblearn
|
|
26
26
|
def fit_transform(features)
|
27
27
|
all_vocaburaries = []
|
28
28
|
word_frequency = Hash.new{|hash, key| hash[key] = 0}
|
29
|
+
document_frequency = Hash.new{|hash, key| hash[key] = 0}
|
30
|
+
word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
|
31
|
+
document_size = features.size
|
29
32
|
|
30
33
|
features.each do |feature|
|
31
|
-
|
34
|
+
token_list = @tokenizer.call(feature)
|
35
|
+
|
36
|
+
# compute tf-value
|
37
|
+
token_list.each do |token|
|
32
38
|
token.downcase! if @lowercase
|
33
|
-
all_vocaburaries << token
|
34
39
|
word_frequency[token] += 1
|
35
40
|
end
|
41
|
+
|
42
|
+
# compute df-value
|
43
|
+
token_list.uniq.each do |token|
|
44
|
+
document_frequency[token] += 1
|
45
|
+
all_vocaburaries << token
|
46
|
+
end
|
36
47
|
end
|
37
48
|
|
38
49
|
all_vocaburaries.uniq!
|
39
50
|
word_frequency = word_frequency.sort{|(_, value1), (_, value2)| value2 <=> value1}
|
40
|
-
|
51
|
+
|
52
|
+
all_vocaburaries.each do |token|
|
53
|
+
tf = 1 + Math.log(word_frequency[token])
|
54
|
+
idf = Math.log(1+(document_size/document_frequency[token]))
|
55
|
+
word_tfidf_score[token] = tf * idf
|
56
|
+
end
|
57
|
+
|
58
|
+
word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
|
59
|
+
|
60
|
+
feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
|
41
61
|
|
42
62
|
token2index = {}
|
43
63
|
feature_names.each_with_index do |token, i|
|
data/lib/rblearn/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|