rblearn 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98bd238b96729a72b7823d341a8f59ed084f7787
4
- data.tar.gz: 8d800e60ab8af1f69367e010b744c715f37b2826
3
+ metadata.gz: e421c233da7861bb3062a93b964b9cc4a3b23f1c
4
+ data.tar.gz: b2be5eb9d5f61bb7a70bda13be2d876afd509dbc
5
5
  SHA512:
6
- metadata.gz: 63751de5c1a69ca150c8e0abfad8b54e27f5757c9897736090df3e1b3b7a3a67aad6db3087afbb1e033d7221012a78e44f0d271be713fcbcf0fd5cc24d863a4f
7
- data.tar.gz: b4d033b2798aa332fa5ac8501b7ccf6bd14cd32caf4631d077f7f1a2c0d172a376358375392c493edaf061203b47c19fc937a44f734bf3c4918a08d210dfd510
6
+ metadata.gz: 6b2d6557d3a07864fe0a50a892f0ea7540fe44a585aaf451afdd220255c11fbc1bf70f9429af955b8b6b8958d4dcb157df46ae67cd0ac1d0091810489c857cc2
7
+ data.tar.gz: eb58039f44c2584d81b524c9dc6faaf24a32ffbf3b31d33a61817e668ede474fdd021fd315c69cfcced7fa4dc74a648b2f696ab32fed50642508e6d4f7a91de7
@@ -0,0 +1,83 @@
1
+
2
+ module Rblearn
3
+
4
# Bag-of-words vectorizer: learns a vocabulary from a corpus and converts
# each document into a row of token counts.
class CountVectorizer

  # TODO: consider the access control of all variables
  attr_accessor :feature_names, :doc_matrix, :token2index

  # tokenizer:    lambda function :: String -> Array<String>
  # lowercase:    whether tokens are downcased before counting :: bool
  # max_features: fraction of the vocabulary (most frequent first) that is
  #               kept as features :: Float \in [0, 1]
  #
  # Stop words are taken from Stopwords::STOP_WORDS and stemmed with
  # String#stem (presumably provided by the stopwords / stemmify gems that
  # the library requires).
  # TODO: by max_features, zero vectors are sometimes created.
  def initialize(tokenizer, lowercase=true, max_features=0.8)
    @tokenizer = tokenizer
    @lowercase = lowercase

    # Work on a private copy: the stop word list is a constant shared with
    # every other user of the library, so it must never be modified in place.
    stop_words = Stopwords::STOP_WORDS.map { |word| word.stem }
    stop_words = stop_words.map { |word| word.downcase } if @lowercase
    @stopwords = stop_words.uniq
    @max_feature = max_features
  end

  # Learns the vocabulary of the corpus and returns its count matrix.
  #
  # features: one raw string per document :: Array<String>
  # returns:  Numo::Int32 matrix of shape [features.size, feature_names.size]
  #           where cell [d, j] is how often feature_names[j] occurs in
  #           document d.
  #
  # Also stores the result in doc_matrix, feature_names and token2index.
  # Stop words never become features. Tokens with equal frequency keep the
  # order in which they were first seen, so the column layout is deterministic.
  def fit_transform(features)
    # Tokenize every document exactly once and reuse the result below.
    documents = features.map { |feature| tokenize(feature) }

    word_frequency = Hash.new(0)
    documents.each do |tokens|
      tokens.each { |token| word_frequency[token] += 1 }
    end

    # Most frequent first; first-seen position breaks ties.
    ranked = word_frequency.each_with_index.sort_by { |(_, count), first_seen| [-count, first_seen] }
    feature_count = (ranked.size * @max_feature).to_i
    feature_names = ranked.first(feature_count).map { |(token, _), _| token }
    token2index = feature_names.each_with_index.to_h

    doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
    documents.each_with_index do |tokens, doc_id|
      # BoW representation
      counts = Hash.new(0)
      tokens.each { |token| counts[token] += 1 }

      counts.each do |token, freq|
        column = token2index[token]
        # Tokens cut off by max_features have no column and are skipped.
        doc_matrix[doc_id, column] = freq if column
      end
    end

    @doc_matrix = doc_matrix
    @feature_names = feature_names
    @token2index = token2index
    @doc_matrix
  end

  private

  # Splits one document into normalized tokens with the stop words removed.
  # Builds new strings instead of mutating what the tokenizer returned.
  def tokenize(feature)
    tokens = @tokenizer.call(feature)
    tokens = tokens.map { |token| token.downcase } if @lowercase
    tokens.reject { |token| @stopwords.include?(token) }
  end
end
72
+
73
+
74
+
75
# Manual smoke test: runs only when this file is executed directly.
# Vectorizes three sample sentences with a whitespace tokenizer that stems
# each token, then prints the count matrix, the feature names and the
# token -> column index mapping.
if __FILE__ == $0
  stemming_tokenizer = lambda { |s| s.split.map { |token| token.stem } }
  # lowercase is a boolean flag, so pass true rather than a truthy Integer.
  cv = CountVectorizer.new(stemming_tokenizer, true, 0.8)
  features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
  p cv.fit_transform(features)
  p cv.feature_names
  p cv.token2index
end
82
+
83
+ end
@@ -0,0 +1,45 @@
1
+
2
+ module Rblearn
3
+ module CrossValidation
4
# Randomly splits samples and their targets into a training and a test part.
#
# x, y:      NArray objects whose first axis enumerates the samples; the
#            same shuffled row indices are applied to both, so rows stay
#            aligned.
# test_size: fraction of the samples put into the test part (rounded down).
# returns:   [x_train, y_train, x_test, y_test]
#
# Rows are selected with arr[Array<Integer>, false]. `false` is the
# "rest dimension" index, so this works for a 1-D target vector as well as
# for matrices, whereas arr[indices, true] demands exactly two dimensions.
def self.train_test_split(x, y, test_size=0.33)
  doc_size = x.shape[0]
  shuffled_indices = (0...doc_size).to_a.shuffle
  test_count = (doc_size * test_size).to_i

  test_indices = shuffled_indices.first(test_count)
  train_indices = shuffled_indices.drop(test_count)

  [x[train_indices, false], y[train_indices, false], x[test_indices, false], y[test_indices, false]]
end
15
+
16
# K-fold cross-validation index generator.
#
# Splits the sample indices 0...n into n_folds consecutive folds; every fold
# serves once as the test set while the remaining folds form the training
# set.
class KFold
  # TODO: make indices and n_folds private

  # n:       number of samples :: Integer
  # n_folds: number of folds, 1 <= n_folds <= n :: Integer
  # shuffle: whether the indices are shuffled before folding :: bool
  #
  # Raises ArgumentError when n_folds is outside 1..n, since the samples
  # could not be spread over that many folds.
  def initialize(n, n_folds, shuffle)
    unless n_folds.between?(1, n)
      raise ArgumentError, "n_folds (#{n_folds}) must be between 1 and n (#{n})"
    end

    indices = (0...n).to_a
    indices.shuffle! if shuffle
    @indices = indices
    @n_folds = n_folds
  end

  # Returns one [train_indices, test_indices] pair per fold
  # :: Array<[Array<Integer>, Array<Integer>]>
  #
  # Fold sizes differ by at most one: the first (n % n_folds) folds receive
  # one extra index. Slicing by a rounded-up chunk size instead can produce
  # fewer than n_folds chunks (e.g. n=5, n_folds=4), which is why the sizes
  # are computed explicitly here.
  def create
    base_size, remainder = @indices.size.divmod(@n_folds)

    offset = 0
    folds = Array.new(@n_folds) do |k|
      size = k < remainder ? base_size + 1 : base_size
      fold = @indices[offset, size]
      offset += size
      fold
    end

    folds.each_index.map do |k|
      # Everything except fold k, kept in fold order.
      train_indices = (folds[0...k] + folds[(k + 1)..-1]).flatten(1)
      [train_indices, folds[k]]
    end
  end
end
+ end
45
+ end
@@ -1,3 +1,3 @@
1
1
  module Rblearn
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/rblearn.rb CHANGED
@@ -1,4 +1,9 @@
1
- require "rblearn/version"
1
+ require 'numo/narray'
2
+ require 'stopwords'
3
+ require 'stemmify'
4
+ require 'rblearn/version'
5
+ require 'rblearn/CountVectorizer'
6
+ require 'rblearn/CrossValidation'
2
7
 
3
8
  module Rblearn
4
9
  # Your code goes here...
data/rblearn.gemspec CHANGED
@@ -25,4 +25,8 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency "bundler", "~> 1.11"
26
26
  spec.add_development_dependency "rake", "~> 10.0"
27
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_runtime_dependency "numo-narray"
30
+ spec.add_runtime_dependency 'stopwords'
31
+ spec.add_runtime_dependency 'stemmify'
28
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rblearn
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - himkt
@@ -52,6 +52,48 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: numo-narray
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: stopwords
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: stemmify
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
55
97
  description: rblearn (ruby-learn) provides methods for feature extracting and some
56
98
  algorithms.
57
99
  email:
@@ -71,6 +113,8 @@ files:
71
113
  - bin/console
72
114
  - bin/setup
73
115
  - lib/rblearn.rb
116
+ - lib/rblearn/CountVectorizer.rb
117
+ - lib/rblearn/CrossValidation.rb
74
118
  - lib/rblearn/version.rb
75
119
  - rblearn.gemspec
76
120
  homepage: https://github.com/himkt/rblearn
@@ -98,3 +142,4 @@ signing_key:
98
142
  specification_version: 4
99
143
  summary: Simple repository for machine learning
100
144
  test_files: []
145
+ has_rdoc: