rblearn 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +64 -3
- data/lib/rblearn/CountVectorizer.rb +23 -3
- data/lib/rblearn/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5aa0e8ff81bd927e89f9b061eb3fbe95b7d90c2e
|
4
|
+
data.tar.gz: c25921d85f54ccd9dad0b53941414db14ac83403
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60bb7c0a1d7e08eadd65117b705cdd449e3f142d6be238d97384e9fefdd2b731c9e2e3860ccf86b9421eb9a0b69200e56281581c8d3a58f44661eefb14634f9b
|
7
|
+
data.tar.gz: dd9de7761f4fb7792843948b89b1933756213b8a632a1e6d41bc5a65221ed0a551034d2510210353f4b77c5cb9bfb6eb596526998a7ff478c31eded9d7e0986f
|
data/README.md
CHANGED
@@ -1,8 +1,15 @@
|
|
1
1
|
# Rblearn
|
2
2
|
|
3
|
-
|
3
|
+
[](LICENSE)
|
4
|
+
[](https://rubygems.org/gems/rblearn)
|
5
|
+
[](https://github.com/himkt/rblearn/issues)
|
6
|
+
[](https://github.com/himkt/rblearn/stargazers)
|
7
|
+
[](https://github.com/himkt/rblearn)
|
8
|
+
[](https://github.com/himkt/rblearn)
|
4
9
|
|
5
|
-
|
10
|
+
ruby-learn is a library for machine learning.
|
11
|
+
|
12
|
+
Currently, it supports cross-validation and feature extraction.
|
6
13
|
|
7
14
|
## Installation
|
8
15
|
|
@@ -22,7 +29,61 @@ Or install it yourself as:
|
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
25
|
-
|
32
|
+
### Cross Validation
|
33
|
+
|
34
|
+
CrossValidation provides two features: train_test_split and k-fold cross-validation.
|
35
|
+
|
36
|
+
1. train_test_split
|
37
|
+
|
38
|
+
This method splits your dataset into train\_set and test\_set.
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
x_train, y_train, x_test, y_test = Rblearn::CrossValidation.train_test_split(x, y, 0.7).map(&:dup)
|
42
|
+
```
|
43
|
+
|
44
|
+
2. K-Fold
|
45
|
+
|
46
|
+
This method is for k-fold cross-validation.
|
47
|
+
|
48
|
+
Three parameters are required.
|
49
|
+
|
50
|
+
1. n :: integer
|
51
|
+
|
52
|
+
n indicates the size of dataset.
|
53
|
+
|
54
|
+
2. n_folds :: integer
|
55
|
+
|
56
|
+
n_folds specifies k, the number of folds.
|
57
|
+
|
58
|
+
3. shuffle :: boolean
|
59
|
+
|
60
|
+
If shuffle is true, the dataset is shuffled.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
kf = Rblearn::CrossValidation::KFold.new(100, 10, true)
|
64
|
+
kf.create #=> list<list<train_set_indices, test_set_indices>>
|
65
|
+
```
|
66
|
+
|
67
|
+
### Count Vectorizer
|
68
|
+
|
69
|
+
CountVectorizer is a feature extractor for text.
|
70
|
+
|
71
|
+
The constructor takes three parameters.
|
72
|
+
|
73
|
+
1. tokenizer :: function
|
74
|
+
|
75
|
+
2. lowercase :: boolean
|
76
|
+
|
77
|
+
3. max_features :: float (ratio of the vocabulary to keep, e.g. 0.7 keeps the top 70% of tokens ranked by score)
|
78
|
+
|
79
|
+
|
80
|
+
For example:
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
cv = Rblearn::CountVectorizer.new(lambda{|feature| feature.split.map(&:stem)}, true, 0.7)
|
84
|
+
cv.fit_transform(features)
|
85
|
+
```
|
86
|
+
|
26
87
|
|
27
88
|
## Development
|
28
89
|
|
@@ -26,18 +26,38 @@ module Rblearn
|
|
26
26
|
def fit_transform(features)
|
27
27
|
all_vocaburaries = []
|
28
28
|
word_frequency = Hash.new{|hash, key| hash[key] = 0}
|
29
|
+
document_frequency = Hash.new{|hash, key| hash[key] = 0}
|
30
|
+
word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
|
31
|
+
document_size = features.size
|
29
32
|
|
30
33
|
features.each do |feature|
|
31
|
-
|
34
|
+
token_list = @tokenizer.call(feature)
|
35
|
+
|
36
|
+
# compute tf-value
|
37
|
+
token_list.each do |token|
|
32
38
|
token.downcase! if @lowercase
|
33
|
-
all_vocaburaries << token
|
34
39
|
word_frequency[token] += 1
|
35
40
|
end
|
41
|
+
|
42
|
+
# compute df-value
|
43
|
+
token_list.uniq.each do |token|
|
44
|
+
document_frequency[token] += 1
|
45
|
+
all_vocaburaries << token
|
46
|
+
end
|
36
47
|
end
|
37
48
|
|
38
49
|
all_vocaburaries.uniq!
|
39
50
|
word_frequency = word_frequency.sort{|(_, value1), (_, value2)| value2 <=> value1}
|
40
|
-
|
51
|
+
|
52
|
+
all_vocaburaries.each do |token|
|
53
|
+
tf = 1 + Math.log(word_frequency[token])
|
54
|
+
idf = Math.log(1+(document_size/document_frequency[token]))
|
55
|
+
word_tfidf_score[token] = tf * idf
|
56
|
+
end
|
57
|
+
|
58
|
+
word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
|
59
|
+
|
60
|
+
feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
|
41
61
|
|
42
62
|
token2index = {}
|
43
63
|
feature_names.each_with_index do |token, i|
|
data/lib/rblearn/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|