rblearn 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rblearn/CountVectorizer.rb +83 -0
- data/lib/rblearn/CrossValidation.rb +45 -0
- data/lib/rblearn/version.rb +1 -1
- data/lib/rblearn.rb +6 -1
- data/rblearn.gemspec +4 -0
- metadata +46 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
|
4
|
+
data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
|
7
|
+
data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
|
@@ -0,0 +1,83 @@
|
|
1
|
+
|
2
|
+
module Rblearn
|
3
|
+
|
4
|
+
class CountVectorizer
  # Bag-of-words vectorizer: turns a list of documents into a document/term
  # count matrix restricted to the most frequent non-stop-word tokens.
  #
  # Relies on Stopwords::STOP_WORDS and String#stem being loaded by the
  # surrounding library (presumably the `stopwords` and `stemmify` gems).

  # TODO: consider the access control of these attributes
  attr_accessor :feature_names, :doc_matrix, :token2index

  # tokenizer:    callable (e.g. lambda) :: String -> Array<String>
  # lowercase:    whether tokens and stop words are lowercased :: bool
  # max_features: fraction of the vocabulary kept as features :: Float in [0, 1]
  # TODO: because of max_features, some documents may become all-zero rows.
  def initialize(tokenizer, lowercase=true, max_features=0.8)
    @tokenizer = tokenizer
    @lowercase = lowercase

    # Build a private copy: Stopwords::STOP_WORDS is a shared constant and
    # must not be rewritten in place every time a vectorizer is created.
    stop_words = Stopwords::STOP_WORDS.map { |token| token.stem }
    stop_words = stop_words.map { |token| token.downcase } if @lowercase
    @stopwords = stop_words
    @max_feature = max_features
  end

  # Learns the vocabulary from +features+ and returns the count matrix.
  #
  # features: one raw document per element :: Array<String>
  # returns:  Numo::Int32 matrix of shape [features.size, feature_names.size]
  #           where cell [d, t] is how often feature t occurs in document d.
  #
  # Also stores the result in #doc_matrix, the selected tokens in
  # #feature_names and the token -> column mapping in #token2index.
  def fit_transform(features)
    # Tokenize each document exactly once; both the vocabulary and the
    # matrix are built from the same stop-word-free token lists.
    documents = features.map { |feature| tokens_of(feature) }

    word_frequency = Hash.new(0)
    documents.each do |tokens|
      tokens.each { |token| word_frequency[token] += 1 }
    end

    # Most frequent tokens first; ties keep first-seen order so the column
    # layout is deterministic.
    ranked = word_frequency.keys.each_with_index.sort_by { |token, order| [-word_frequency[token], order] }
    # Array#first clamps at the vocabulary size, so a ratio above 1 is harmless.
    limit = [(ranked.size * @max_feature).to_i, 0].max
    feature_names = ranked.first(limit).map { |token, _| token }

    token2index = {}
    feature_names.each_with_index { |token, i| token2index[token] = i }

    doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
    documents.each_with_index do |tokens, doc_id|
      # BoW representation of a single document
      counter = Hash.new(0)
      tokens.each { |token| counter[token] += 1 }

      counter.each do |token, freq|
        column = token2index[token]
        # Tokens cut off by max_features have no column and are skipped.
        doc_matrix[doc_id, column] = freq if column
      end
    end

    @feature_names = feature_names
    @token2index = token2index
    @doc_matrix = doc_matrix
  end

  private

  # Tokenizes one document, lowercases the tokens when requested (without
  # mutating the tokenizer's own strings) and drops stop words.
  def tokens_of(feature)
    tokens = @tokenizer.call(feature)
    tokens = tokens.map { |token| token.downcase } if @lowercase
    tokens.reject { |token| @stopwords.include?(token) }
  end
end
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
# Demo: when this file is executed directly, vectorize a tiny corpus and
# print the count matrix, the selected features and the token index.
if $PROGRAM_NAME == __FILE__
  stemming_tokenizer = ->(text) { text.split.map { |word| word.stem } }
  vectorizer = CountVectorizer.new(stemming_tokenizer, 1, 0.8)
  corpus = [
    'I am train man which automata and philosophy',
    'numerical analysis young man',
    'logic programmer'
  ]

  p vectorizer.fit_transform(corpus)
  p vectorizer.feature_names
  p vectorizer.token2index
end
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
|
2
|
+
module Rblearn
|
3
|
+
module CrossValidation
|
4
|
+
# Randomly splits samples and their targets into a training and a test part.
#
# x, y:      NArray objects whose first axis indexes the samples; y may be a
#            1-D label vector or have the same rank as x
# test_size: fraction of samples put into the test part :: Float in [0, 1]
# returns:   [x_train, y_train, x_test, y_test]
#
# Rows are picked with array[Array<Integer>, false]; in Numo `false` is the
# ellipsis index, so the same call works for any rank of two or more.
def self.train_test_split(x, y, test_size=0.33)
  doc_size = x.shape[0]
  random_indices = (0...doc_size).to_a.shuffle
  endpoint = (doc_size * test_size).to_i
  test_indices = random_indices[0...endpoint]
  train_indices = random_indices[endpoint..-1]

  # A 1-D array accepts only a single index, so it is sliced without the
  # trailing ellipsis.
  select_rows = lambda do |array, rows|
    array.shape.size == 1 ? array[rows] : array[rows, false]
  end

  [
    select_rows.call(x, train_indices),
    select_rows.call(y, train_indices),
    select_rows.call(x, test_indices),
    select_rows.call(y, test_indices)
  ]
end
|
15
|
+
|
16
|
+
# K-fold cross-validation index generator over the sample indices 0...n.
class KFold
  # TODO: make indices and n_folds private

  # n:       number of samples :: Integer
  # n_folds: number of folds to create :: Integer
  # shuffle: whether the indices are shuffled before being cut into folds
  def initialize(n, n_folds, shuffle)
    indices = (0...n).to_a
    indices.shuffle! if shuffle
    @indices = indices
    @n_folds = n_folds
  end

  # Builds the folds.
  #
  # returns: Array of n_folds pairs [train_indices, test_indices]; every
  #          index appears in exactly one test set, and each train set is
  #          made of all the other folds in their original order.
  # raises:  ArgumentError unless 1 <= n_folds <= n
  #
  # The indices are cut into contiguous folds whose sizes differ by at most
  # one: the first (n % n_folds) folds receive one extra element. This always
  # yields exactly n_folds folds, also when n is not a multiple of n_folds.
  def create
    total = @indices.size
    unless @n_folds >= 1 && @n_folds <= total
      raise ArgumentError, "n_folds must be between 1 and n (n_folds=#{@n_folds}, n=#{total})"
    end

    base, extra = total.divmod(@n_folds)
    offset = 0
    folds = Array.new(@n_folds) do |k|
      length = k < extra ? base + 1 : base
      fold = @indices[offset, length]
      offset += length
      fold
    end

    Array.new(@n_folds) do |k|
      train_set = (folds.take(k) + folds.drop(k + 1)).flatten(1)
      [train_set, folds[k]]
    end
  end
end
|
44
|
+
end
|
45
|
+
end
|
data/lib/rblearn/version.rb
CHANGED
data/lib/rblearn.rb
CHANGED
data/rblearn.gemspec
CHANGED
@@ -25,4 +25,8 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency "bundler", "~> 1.11"
|
26
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
27
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
|
+
|
29
|
+
spec.add_runtime_dependency "numo-narray"
|
30
|
+
spec.add_runtime_dependency 'stopwords'
|
31
|
+
spec.add_runtime_dependency 'stemmify'
|
28
32
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
@@ -52,6 +52,48 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: numo-narray
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: stopwords
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: stemmify
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
55
97
|
description: rblearn (ruby-learn) provides methods for feature extracting and some
|
56
98
|
algorithms.
|
57
99
|
email:
|
@@ -71,6 +113,8 @@ files:
|
|
71
113
|
- bin/console
|
72
114
|
- bin/setup
|
73
115
|
- lib/rblearn.rb
|
116
|
+
- lib/rblearn/CountVectorizer.rb
|
117
|
+
- lib/rblearn/CrossValidation.rb
|
74
118
|
- lib/rblearn/version.rb
|
75
119
|
- rblearn.gemspec
|
76
120
|
homepage: https://github.com/himkt/rblearn
|
@@ -98,3 +142,4 @@ signing_key:
|
|
98
142
|
specification_version: 4
|
99
143
|
summary: Simple repository for machine learning
|
100
144
|
test_files: []
|
145
|
+
has_rdoc:
|