rblearn 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98bd238b96729a72b7823d341a8f59ed084f7787
4
- data.tar.gz: 8d800e60ab8af1f69367e010b744c715f37b2826
3
+ metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
4
+ data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
5
5
  SHA512:
6
- metadata.gz: 63751de5c1a69ca150c8e0abfad8b54e27f5757c9897736090df3e1b3b7a3a67aad6db3087afbb1e033d7221012a78e44f0d271be713fcbcf0fd5cc24d863a4f
7
- data.tar.gz: b4d033b2798aa332fa5ac8501b7ccf6bd14cd32caf4631d077f7f1a2c0d172a376358375392c493edaf061203b47c19fc937a44f734bf3c4918a08d210dfd510
6
+ metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
7
+ data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
@@ -0,0 +1,83 @@
1
+
2
+ module Rblearn
3
+
4
# Bag-of-words vectorizer: turns raw documents into a document-term count
# matrix restricted to the most frequent non-stop-word tokens.
class CountVectorizer

  # TODO: consider the access control of these attributes
  attr_accessor :feature_names, :doc_matrix, :token2index

  # tokenizer:    callable (e.g. a lambda) :: String -> Array<String>
  # lowercase:    when truthy, tokens and stop words are downcased before counting
  # max_features: fraction of the vocabulary to keep, most frequent first
  #               :: Float in [0, 1] (values outside the range are clamped)
  #
  # Stop words are taken from Stopwords::STOP_WORDS and stemmed with String#stem.
  # TODO: with a small max_features some documents become all-zero rows.
  def initialize(tokenizer, lowercase=true, max_features=0.8)
    @tokenizer = tokenizer
    @lowercase = lowercase

    # Work on a copy: the shared Stopwords::STOP_WORDS constant must not be
    # rewritten every time a vectorizer is instantiated.
    stop_words = Stopwords::STOP_WORDS.map(&:stem)
    stop_words = stop_words.map(&:downcase) if @lowercase

    # Hash keys give constant-time membership checks while tokenizing.
    @stopwords = stop_words.each_with_object({}) { |word, seen| seen[word] = true }
    @max_feature = max_features
  end

  # Learns the vocabulary from the documents and returns their count matrix.
  #
  # features: the documents :: Array<String>
  #
  # Returns a Numo::Int32 matrix with one row per document and one column per
  # kept token. The same matrix, the kept tokens and the token -> column map
  # are also stored in doc_matrix, feature_names and token2index.
  def fit_transform(features)
    # Tokenize every document once; both the vocabulary and the counts are
    # derived from the same normalized, stop-word-free token lists.
    tokenized = features.map { |feature| normalized_tokens(feature) }

    word_frequency = Hash.new(0)
    tokenized.each do |tokens|
      tokens.each { |token| word_frequency[token] += 1 }
    end

    # Most frequent first; ties keep first-seen order so the column layout
    # is deterministic.
    ranked = word_frequency.each_with_index.sort_by do |(_token, freq), order|
      [-freq, order]
    end

    # Clamp so that a max_features outside [0, 1] cannot index past the list.
    keep = [[(ranked.size * @max_feature).to_i, 0].max, ranked.size].min
    feature_names = ranked.first(keep).map { |(token, _freq), _order| token }

    token2index = {}
    feature_names.each_with_index { |token, index| token2index[token] = index }

    doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
    tokenized.each_with_index do |tokens, doc_id|
      # BoW representation
      counter = Hash.new(0)
      tokens.each { |token| counter[token] += 1 }

      counter.each do |token, freq|
        column = token2index[token]
        # Tokens cut off by max_features have no column and are skipped.
        doc_matrix[doc_id, column] = freq if column
      end
    end

    @doc_matrix = doc_matrix
    @feature_names = feature_names
    @token2index = token2index
    @doc_matrix
  end

  private

  # Tokenizes one document, downcases the tokens when requested (without
  # mutating the tokenizer's strings) and drops stop words.
  def normalized_tokens(feature)
    tokens = @tokenizer.call(feature)
    tokens = tokens.map(&:downcase) if @lowercase
    tokens.reject { |token| @stopwords.key?(token) }
  end
end
72
+
73
+
74
+
75
# Manual smoke test: runs only when this file is executed directly and prints
# the count matrix, the kept tokens and the token -> column map for a tiny corpus.
if __FILE__ == $0
  # The second argument is the lowercase flag; pass a real boolean.
  cv = CountVectorizer.new(->(s) { s.split.map(&:stem) }, true, 0.8)
  features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
  p cv.fit_transform(features)
  p cv.feature_names
  p cv.token2index
end
82
+
83
+ end
@@ -0,0 +1,45 @@
1
+
2
+ module Rblearn
3
+ module CrossValidation
4
# Randomly partitions paired samples into a training part and a test part.
#
# x, y:      NArray objects whose first axis indexes the samples
#            (only #shape, #ndim and #[] are used here)
# test_size: fraction of the samples placed in the test part; the count is
#            (samples * test_size).to_i, i.e. rounded down
# random:    optional generator forwarded to Array#shuffle so a split can be
#            reproduced; nil keeps the default random source
#
# Returns [x_train, y_train, x_test, y_test].
def self.train_test_split(x, y, test_size=0.33, random: nil)
  doc_size = x.shape[0]
  order = (0...doc_size).to_a
  random_indices = random ? order.shuffle(random: random) : order.shuffle
  endpoint = (doc_size * test_size).to_i
  test_indices = random_indices[0...endpoint]
  train_indices = random_indices[endpoint..-1]

  # A matrix is sliced by rows with array[Array<Integer>, true]. A 1-D array
  # (e.g. a label vector) has no second axis to pass `true` for, so it is
  # indexed with the row list alone.
  take_rows = lambda do |array, indices|
    array.ndim > 1 ? array[indices, true] : array[indices]
  end

  [
    take_rows.call(x, train_indices), take_rows.call(y, train_indices),
    take_rows.call(x, test_indices), take_rows.call(y, test_indices)
  ]
end
15
+
16
# K-fold splitter over the sample indices 0...n.
class KFold
  # TODO: make indices and n_folds private

  # n:       number of samples; the indices are 0...n
  # n_folds: number of folds :: Integer in 1..n
  # shuffle: when truthy, the indices are shuffled before being cut into folds
  # random:  optional generator forwarded to Array#shuffle! so the folds can
  #          be reproduced; nil keeps the default random source
  #
  # Raises ArgumentError when n_folds is not an Integer in 1..n.
  def initialize(n, n_folds, shuffle, random: nil)
    unless n_folds.is_a?(Integer) && n_folds.between?(1, n)
      raise ArgumentError, "n_folds must be an Integer in 1..n (got n_folds=#{n_folds.inspect}, n=#{n.inspect})"
    end

    indices = (0...n).to_a
    if shuffle
      random ? indices.shuffle!(random: random) : indices.shuffle!
    end
    @indices = indices
    @n_folds = n_folds
  end

  # Builds the folds.
  #
  # Returns an Array with n_folds entries, each [train_indices, test_indices]:
  # test_indices is the k-th contiguous slice of the (possibly shuffled)
  # indices and train_indices is everything else, in the same order.
  def create
    # Balanced fold sizes: the first `extra` folds get one more index. Slicing
    # by ceil(n / n_folds) can produce fewer than n_folds slices when n is not
    # a multiple of n_folds, which left some folds without a slice.
    base, extra = @indices.size.divmod(@n_folds)
    start = 0

    Array.new(@n_folds) do |k|
      stop = start + (k < extra ? base + 1 : base)
      test_set = @indices[start...stop]
      train_set = @indices[0...start] + @indices[stop..-1]
      start = stop
      [train_set, test_set]
    end
  end
end
44
+ end
45
+ end
@@ -1,3 +1,3 @@
1
1
  module Rblearn
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/rblearn.rb CHANGED
@@ -1,4 +1,9 @@
1
- require "rblearn/version"
1
+ require 'numo/narray'
2
+ require 'stopwords'
3
+ require 'stemmify'
4
+ require 'rblearn/version'
5
+ require 'rblearn/CountVectorizer'
6
+ require 'rblearn/CrossValidation'
2
7
 
3
8
  module Rblearn
4
9
  # Your code goes here...
data/rblearn.gemspec CHANGED
@@ -25,4 +25,8 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency "bundler", "~> 1.11"
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_runtime_dependency "numo-narray"
30
+ spec.add_runtime_dependency 'stopwords'
31
+ spec.add_runtime_dependency 'stemmify'
28
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rblearn
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - himkt
@@ -52,6 +52,48 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: numo-narray
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: stopwords
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: stemmify
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
55
97
  description: rblearn (ruby-learn) provides methods for feature extracting and some
56
98
  algorithms.
57
99
  email:
@@ -71,6 +113,8 @@ files:
71
113
  - bin/console
72
114
  - bin/setup
73
115
  - lib/rblearn.rb
116
+ - lib/rblearn/CountVectorizer.rb
117
+ - lib/rblearn/CrossValidation.rb
74
118
  - lib/rblearn/version.rb
75
119
  - rblearn.gemspec
76
120
  homepage: https://github.com/himkt/rblearn
@@ -98,3 +142,4 @@ signing_key:
98
142
  specification_version: 4
99
143
  summary: Simple repository for machine learning
100
144
  test_files: []
145
+ has_rdoc: