rblearn 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rblearn/CountVectorizer.rb +83 -0
- data/lib/rblearn/CrossValidation.rb +45 -0
- data/lib/rblearn/version.rb +1 -1
- data/lib/rblearn.rb +6 -1
- data/rblearn.gemspec +4 -0
- metadata +46 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
|
4
|
+
data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
|
7
|
+
data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
|
@@ -0,0 +1,83 @@
|
|
1
|
+
|
2
|
+
module Rblearn
|
3
|
+
|
4
|
+
class CountVectorizer
  # Results of the most recent #fit_transform call.
  # TODO: consider the access control for all variables
  attr_accessor :feature_names, :doc_matrix, :token2index

  # tokenizer: callable that splits a document :: String -> Array<String>
  # lowercase: whether tokens (and stop words) are lowercased :: bool
  # max_features: fraction of the ranked vocabulary to keep :: Float \in [0, 1]
  #
  # The stop-word list is taken from Stopwords::STOP_WORDS and stemmed with
  # String#stem (both come from outside this class).
  # TODO: by max_features, zero vectors are sometimes created.
  def initialize(tokenizer, lowercase=true, max_features=0.8)
    @tokenizer = tokenizer
    @lowercase = lowercase

    # Work on a private copy: Stopwords::STOP_WORDS is shared, so mutating it
    # in place would re-stem it on every instantiation and leak the change
    # to every other user of the constant.
    stop_words = Stopwords::STOP_WORDS.map { |word| word.stem }
    stop_words = stop_words.map { |word| word.downcase } if @lowercase
    @stopwords = stop_words
    @max_feature = max_features
  end

  # Learns the vocabulary from the documents and returns their bag-of-words
  # count matrix.
  #
  # features: one raw document per element :: Array<String>
  # returns: Numo::Int32 of shape [documents, kept vocabulary size]; cell
  #          [d, j] is how often feature_names[j] occurs in document d.
  #
  # Stop words are excluded from the vocabulary itself, so they neither take
  # up max_features slots nor produce all-zero columns. The result is also
  # stored in #doc_matrix, alongside #feature_names and #token2index.
  def fit_transform(features)
    # Tokenize every document exactly once and reuse the result for both the
    # vocabulary pass and the matrix pass.
    documents = features.map { |feature| tokenize(feature) }

    word_frequency = Hash.new(0)
    documents.each do |tokens|
      tokens.each { |token| word_frequency[token] += 1 }
    end

    # Most frequent first; ties keep first-seen order so the column layout is
    # deterministic (a plain sort with a block gives no such guarantee).
    ranked = word_frequency.sort_by.with_index { |(_, count), position| [-count, position] }
    vocabulary_size = (ranked.size * @max_feature).to_i
    feature_names = ranked.first(vocabulary_size).map(&:first)

    token2index = {}
    feature_names.each_with_index { |token, index| token2index[token] = index }

    doc_matrix = Numo::Int32.zeros([documents.size, feature_names.size])
    documents.each_with_index do |tokens, doc_id|
      # BoW representation of a single document
      counts = Hash.new(0)
      tokens.each { |token| counts[token] += 1 }

      counts.each do |token, freq|
        column = token2index[token]
        # Tokens cut off by max_features have no column and are skipped.
        doc_matrix[doc_id, column] = freq if column
      end
    end

    @doc_matrix = doc_matrix
    @feature_names = feature_names
    @token2index = token2index
    @doc_matrix
  end

  private

  # Tokenizes one document, lowercases the tokens when configured and drops
  # stop words. Builds new strings instead of using downcase!, because the
  # tokenizer may hand back frozen or shared strings.
  def tokenize(feature)
    tokens = @tokenizer.call(feature)
    tokens = tokens.map { |token| token.downcase } if @lowercase
    tokens.reject { |token| @stopwords.include?(token) }
  end
end
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
# Smoke demo: only runs when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  # Whitespace tokenizer that stems every word (String#stem comes from outside this file).
  stemming_tokenizer = lambda do |sentence|
    sentence.split.map { |word| word.stem }
  end

  vectorizer = CountVectorizer.new(stemming_tokenizer, 1, 0.8)
  documents = [
    'I am train man which automata and philosophy',
    'numerical analysis young man',
    'logic programmer'
  ]

  p vectorizer.fit_transform(documents)
  p vectorizer.feature_names
  p vectorizer.token2index
end
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
|
2
|
+
module Rblearn
|
3
|
+
module CrossValidation
|
4
|
+
# x, y: Numo::NArray objects with one sample per row (x.shape[0] samples)
|
5
|
+
# We slice a matrix by x[Array<Integer>, true]
|
6
|
+
# Randomly partitions the rows of x and y into a training and a test subset,
# keeping the rows of x and y aligned.
#
# x: samples, one per row; must respond to #shape and #[] (e.g. Numo::NArray)
# y: targets with as many rows as x; may be a matrix or a 1-D vector
# test_size: fraction of rows that go to the test subset :: Float \in [0, 1]
# returns: [x_train, y_train, x_test, y_test]
# raises: ArgumentError when test_size is outside [0, 1] or when x and y
#         disagree on the number of rows
def self.train_test_split(x, y, test_size=0.33)
  unless test_size.is_a?(Numeric) && test_size >= 0 && test_size <= 1
    raise ArgumentError, "test_size must be within [0, 1], got #{test_size.inspect}"
  end

  doc_size = x.shape[0]
  unless y.shape[0] == doc_size
    raise ArgumentError, "x and y must have the same number of rows (#{doc_size} vs #{y.shape[0]})"
  end

  random_indices = (0...doc_size).to_a.shuffle
  endpoint = (doc_size * test_size).to_i
  test_indices = random_indices[0...endpoint]
  train_indices = random_indices[endpoint..-1]

  # `true` selects everything along the second axis. A 1-D array (the usual
  # shape of a label vector) has no second axis, so it is indexed by row only.
  take_rows = lambda do |array, indices|
    array.shape.size > 1 ? array[indices, true] : array[indices]
  end

  [
    take_rows.call(x, train_indices),
    take_rows.call(y, train_indices),
    take_rows.call(x, test_indices),
    take_rows.call(y, test_indices)
  ]
end
|
15
|
+
|
16
|
+
# K-fold cross-validation index generator over the sample indices 0...n.
class KFold
  # n: total number of samples :: Integer
  # n_folds: number of folds to produce :: Integer
  # shuffle: whether the indices are shuffled before being split :: bool
  def initialize(n, n_folds, shuffle)
    indices = (0...n).to_a
    indices.shuffle! if shuffle
    @indices = indices
    @n_folds = n_folds
  end

  # Builds the folds.
  #
  # returns: Array of n_folds pairs [train_indices, test_indices]. Every
  #          index appears in exactly one test set; the matching train set
  #          holds all remaining indices.
  # raises: ArgumentError unless n_folds is an Integer with 1 <= n_folds <= n
  #
  # Fold sizes differ by at most one: the first (n % n_folds) folds receive
  # one extra index. Slicing by ceil(n / n_folds) instead can yield fewer
  # than n_folds groups (e.g. n = 9, n_folds = 4 gives only 3 groups).
  def create
    unless @n_folds.is_a?(Integer) && @n_folds.between?(1, @indices.size)
      raise ArgumentError,
            "n_folds must be an Integer between 1 and #{@indices.size}, got #{@n_folds.inspect}"
    end

    base_size, remainder = @indices.size.divmod(@n_folds)
    offset = 0
    folds = Array.new(@n_folds) do |k|
      size = k < remainder ? base_size + 1 : base_size
      fold = @indices[offset, size]
      offset += size
      fold
    end

    (0...@n_folds).map do |k|
      train_set = (folds[0...k] + folds[(k + 1)..-1]).flatten(1)
      [train_set, folds[k]]
    end
  end
end
|
44
|
+
end
|
45
|
+
end
|
data/lib/rblearn/version.rb
CHANGED
data/lib/rblearn.rb
CHANGED
data/rblearn.gemspec
CHANGED
@@ -25,4 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency "bundler", "~> 1.11"
|
26
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
27
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
|
+
|
29
|
+
spec.add_runtime_dependency "numo-narray"
|
30
|
+
spec.add_runtime_dependency 'stopwords'
|
31
|
+
spec.add_runtime_dependency 'stemmify'
|
28
32
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
@@ -52,6 +52,48 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: numo-narray
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: stopwords
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: stemmify
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
55
97
|
description: rblearn (ruby-learn) provides methods for feature extracting and some
|
56
98
|
algorithms.
|
57
99
|
email:
|
@@ -71,6 +113,8 @@ files:
|
|
71
113
|
- bin/console
|
72
114
|
- bin/setup
|
73
115
|
- lib/rblearn.rb
|
116
|
+
- lib/rblearn/CountVectorizer.rb
|
117
|
+
- lib/rblearn/CrossValidation.rb
|
74
118
|
- lib/rblearn/version.rb
|
75
119
|
- rblearn.gemspec
|
76
120
|
homepage: https://github.com/himkt/rblearn
|
@@ -98,3 +142,4 @@ signing_key:
|
|
98
142
|
specification_version: 4
|
99
143
|
summary: Simple repository for machine learning
|
100
144
|
test_files: []
|
145
|
+
has_rdoc:
|