rumale 0.14.5 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 53beac1d67735cd6f03714c82bbd149379df845f
4
- data.tar.gz: 303720c35bc1b14b720379abb6464812778398a2
3
+ metadata.gz: d8823e97350be198c39b1896dc88978a47b526f1
4
+ data.tar.gz: d65a2d3274d104eae9aa20dd97078159614783e1
5
5
  SHA512:
6
- metadata.gz: 44e7893da77fcb9bcc245d096b69553b87bb740c160f879f7a1d8f00d30f1fc39f59831b17db6ff267772634cf356bceac5c5aa9f91792f0a6998938c60e2cb3
7
- data.tar.gz: 21b3c761487805a31fc61d3570582892d02b97c3c14c01bf0813dd9893b79797a9092f1a9c8ecc47434fbff079a495897a8bf81109e026ac0c2a212615df7831
6
+ metadata.gz: 5f06921658636e7765edc7a71aa9df28bf6a5cd4b36706671b2bf3a75c55755d190e8ab4dc9801cf94a4034b04429a6f14d8978c0323a9a2a6bf5d34456aa2e5
7
+ data.tar.gz: 74dfe7a75358e9e26da392dae39e71c1441a56c63ad51236db3bedfda4156f8cd86a1b69c40574e34b2db46fb8bdb9f22da60bde0a9423afa6849b11551a9494
@@ -6,6 +6,7 @@ rvm:
6
6
  - '2.4'
7
7
  - '2.5'
8
8
  - '2.6'
9
+ - '2.7'
9
10
 
10
11
  addons:
11
12
  apt:
@@ -1,3 +1,8 @@
1
+ # 0.15.0
2
+ - Add feature extractor classes:
3
+ - [HashVectorizer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/HashVectorizer.html)
4
+ - [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html)
5
+
1
6
  # 0.14.5
2
7
  - Fix to suppress deprecation warning about keyword argument in Ruby 2.7.
3
8
 
@@ -77,6 +77,8 @@ require 'rumale/manifold/mds'
77
77
  require 'rumale/neural_network/base_mlp'
78
78
  require 'rumale/neural_network/mlp_regressor'
79
79
  require 'rumale/neural_network/mlp_classifier'
80
+ require 'rumale/feature_extraction/hash_vectorizer'
81
+ require 'rumale/feature_extraction/feature_hasher'
80
82
  require 'rumale/preprocessing/l2_normalizer'
81
83
  require 'rumale/preprocessing/min_max_scaler'
82
84
  require 'rumale/preprocessing/max_abs_scaler'
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module FeatureExtraction
8
+ # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
9
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
10
+ # This encoder employs signed 32-bit Murmurhash3 as the hash function.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
14
+ # x = encoder.transform([
15
+ # { dog: 1, cat: 2, elephant: 4 },
16
+ # { dog: 2, run: 5 }
17
+ # ])
18
+ # # > pp x
19
+ # # Numo::DFloat#shape=[2,10]
20
+ # # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
21
+ # # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
22
+ class FeatureHasher
23
+ include Base::BaseEstimator
24
+ include Base::Transformer
25
+
26
+ # Create a new encoder for converting array of hash consisting of feature names and values to vectors
27
+ # with feature hashing algorithm.
28
+ #
29
+ # @param n_features [Integer] The number of features of encoded samples.
30
+ # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
31
+ def initialize(n_features: 1024, alternate_sign: true)
32
+ check_params_numeric(n_features: n_features)
33
+ check_params_boolean(alternate_sign: alternate_sign)
34
+ @params = {}
35
+ @params[:n_features] = n_features
36
+ @params[:alternate_sign] = alternate_sign
37
+ end
38
+
39
+ # This method does not do anything. The encoder does not require training.
40
+ #
41
+ # @overload fit(x) -> FeatureHasher
42
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
43
+ # @return [FeatureHasher]
44
+ def fit(_x = nil, _y = nil)
45
+ self
46
+ end
47
+
48
+ # Encode the given array of feature-value hashes.
49
+ # This method has the same output as the transform method
50
+ # because the encoder does not require training.
51
+ #
52
+ # @overload fit_transform(x) -> Numo::DFloat
53
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
54
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
55
+ def fit_transform(x, _y = nil)
56
+ fit(x).transform(x)
57
+ end
58
+
59
+ # Encode the given array of feature-value hashes.
60
+ #
61
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
62
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
63
+ def transform(x)
64
+ x = [x] unless x.is_a?(Array)
65
+ n_samples = x.size
66
+
67
+ z = Numo::DFloat.zeros(n_samples, n_features)
68
+
69
+ x.each_with_index do |f, i|
70
+ f.each do |k, v|
71
+ k = "#{k}=#{v}" if v.is_a?(String)
72
+ val = v.is_a?(String) ? 1 : v
73
+ next if val.zero?
74
+
75
+ h = murmur_hash(k.to_s)
76
+ fid = h.abs % n_features
77
+ val *= h >= 0 ? 1 : -1 if alternate_sign?
78
+ z[i, fid] = val
79
+ end
80
+ end
81
+
82
+ z
83
+ end
84
+
85
+ private
86
+
87
+ def n_features
88
+ @params[:n_features]
89
+ end
90
+
91
+ def alternate_sign?
92
+ @params[:alternate_sign]
93
+ end
94
+
95
+ # MurmurHash3_32
96
+ # References:
97
+ # - https://en.wikipedia.org/wiki/MurmurHash
98
+ # - https://github.com/aappleby/smhasher
99
+ def murmur_hash(key_str, seed = 0)
100
+ keyb = key_str.bytes
101
+ key_len = keyb.size
102
+ n_blocks = key_len / 4
103
+
104
+ h = seed
105
+ (0...n_blocks * 4).step(4) do |bstart|
106
+ k = keyb[bstart + 3] << 24 | keyb[bstart + 2] << 16 | keyb[bstart + 1] << 8 | keyb[bstart + 0]
107
+ h ^= murmur_scramble(k)
108
+ h = murmur_rotl(h, 13)
109
+ h = (h * 5 + 0xe6546b64) & 0xFFFFFFFF
110
+ end
111
+
112
+ tail_id = n_blocks * 4
113
+ tail_sz = key_len & 3
114
+
115
+ k = 0
116
+ k ^= keyb[tail_id + 2] << 16 if tail_sz >= 3
117
+ k ^= keyb[tail_id + 1] << 8 if tail_sz >= 2
118
+ k ^= keyb[tail_id + 0] if tail_sz >= 1
119
+ h ^= murmur_scramble(k) if tail_sz.positive?
120
+
121
+ h = murmur_fmix(h ^ key_len)
122
+
123
+ if (h & 0x80000000).zero?
124
+ h
125
+ else
126
+ -((h ^ 0xFFFFFFFF) + 1)
127
+ end
128
+ end
129
+
130
+ def murmur_rotl(x, r)
131
+ (x << r | x >> (32 - r)) & 0xFFFFFFFF
132
+ end
133
+
134
+ def murmur_scramble(k)
135
+ k = (k * 0xcc9e2d51) & 0xFFFFFFFF
136
+ k = murmur_rotl(k, 15)
137
+ (k * 0x1b873593) & 0xFFFFFFFF
138
+ end
139
+
140
+ def murmur_fmix(h)
141
+ h ^= h >> 16
142
+ h = (h * 0x85ebca6b) & 0xFFFFFFFF
143
+ h ^= h >> 13
144
+ h = (h * 0xc2b2ae35) & 0xFFFFFFFF
145
+ h ^ (h >> 16)
146
+ end
147
+ end
148
+ end
149
+ end
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ # This module consists of the classes that extract features from raw data.
8
+ module FeatureExtraction
9
+ # Encode array of feature-value hash to vectors.
10
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ # # > pp x
19
+ # # Numo::DFloat#shape=[2,3]
20
+ # # [[2, 0, 1],
21
+ # # [0, 1, 3]]
22
+ #
23
+ # x = encoder.fit_transform([
24
+ # { city: 'Dubai', temperature: 33 },
25
+ # { city: 'London', temperature: 12 },
26
+ # { city: 'San Francisco', temperature: 18 }
27
+ # ])
28
+ # # > pp x
29
+ # # Numo::DFloat#shape=[3,4]
30
+ # # [[1, 0, 0, 33],
31
+ # # [0, 1, 0, 12],
32
+ # # [0, 0, 1, 18]]
33
+ # # > pp encoder.inverse_transform(x)
34
+ # # [{:city=>"Dubai", :temperature=>33.0},
35
+ # # {:city=>"London", :temperature=>12.0},
36
+ # # {:city=>"San Francisco", :temperature=>18.0}]
37
+ class HashVectorizer
38
+ include Base::BaseEstimator
39
+ include Base::Transformer
40
+
41
+ # Return the list of feature names.
42
+ # @return [Array] (size: [n_features])
43
+ attr_reader :feature_names
44
+
45
+ # Return the hash consisting of pairs of feature names and indices.
46
+ # @return [Hash] (size: [n_features])
47
+ attr_reader :vocabulary
48
+
49
+ # Create a new encoder for converting array of hash consisting of feature names and values to vectors.
50
+ #
51
+ # @param separator [String] The separator string used for constructing new feature names for categorical feature.
52
+ # @param sort [Boolean] The flag indicating whether to sort feature names.
53
+ def initialize(separator: '=', sort: true)
54
+ check_params_string(separator: separator)
55
+ check_params_boolean(sort: sort)
56
+ @params = {}
57
+ @params[:separator] = separator
58
+ @params[:sort] = sort
59
+ end
60
+
61
+ # Fit the encoder with given training data.
62
+ #
63
+ # @overload fit(x) -> HashVectorizer
64
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
65
+ # @return [HashVectorizer]
66
+ def fit(x, _y = nil)
67
+ @feature_names = []
68
+ @vocabulary = {}
69
+
70
+ x.each do |f|
71
+ f.each do |k, v|
72
+ k = "#{k}#{separator}#{v}".to_sym if v.is_a?(String)
73
+ next if @vocabulary.key?(k)
74
+ @feature_names.push(k)
75
+ @vocabulary[k] = @vocabulary.size
76
+ end
77
+ end
78
+
79
+ if sort_feature?
80
+ @feature_names.sort!
81
+ @feature_names.each_with_index { |k, i| @vocabulary[k] = i }
82
+ end
83
+
84
+ self
85
+ end
86
+
87
+ # Fit the encoder with given training data, then return encoded data.
88
+ #
89
+ # @overload fit_transform(x) -> Numo::DFloat
90
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
91
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
92
+ def fit_transform(x, _y = nil)
93
+ fit(x).transform(x)
94
+ end
95
+
96
+ # Encode the given array of feature-value hashes.
97
+ #
98
+ # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
99
+ # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
100
+ def transform(x)
101
+ x = [x] unless x.is_a?(Array)
102
+ n_samples = x.size
103
+ n_features = @vocabulary.size
104
+ z = Numo::DFloat.zeros(n_samples, n_features)
105
+
106
+ x.each_with_index do |f, i|
107
+ f.each do |k, v|
108
+ if v.is_a?(String)
109
+ k = "#{k}#{separator}#{v}".to_sym
110
+ v = 1
111
+ end
112
+ z[i, @vocabulary[k]] = v if @vocabulary.key?(k)
113
+ end
114
+ end
115
+
116
+ z
117
+ end
118
+
119
+ # Decode sample matrix to the array of feature-value hash.
120
+ #
121
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
122
+ # @return [Array<Hash>] The array of hash consisting of feature names and values.
123
+ def inverse_transform(x)
124
+ n_samples = x.shape[0]
125
+ reconst = []
126
+
127
+ n_samples.times do |i|
128
+ f = {}
129
+ x[i, true].each_with_index do |el, j|
130
+ feature_key_val(@feature_names[j], el).tap { |k, v| f[k.to_sym] = v } unless el.zero?
131
+ end
132
+ reconst.push(f)
133
+ end
134
+
135
+ reconst
136
+ end
137
+
138
+ private
139
+
140
+ def feature_key_val(fname, fval)
141
+ f = fname.to_s.split(separator)
142
+ f.size == 2 ? f : [fname, fval]
143
+ end
144
+
145
+ def separator
146
+ @params[:separator]
147
+ end
148
+
149
+ def sort_feature?
150
+ @params[:sort]
151
+ end
152
+ end
153
+ end
154
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.5'
6
+ VERSION = '0.15.0'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.5
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-28 00:00:00.000000000 Z
11
+ date: 2020-01-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -207,6 +207,8 @@ files:
207
207
  - lib/rumale/evaluation_measure/recall.rb
208
208
  - lib/rumale/evaluation_measure/roc_auc.rb
209
209
  - lib/rumale/evaluation_measure/silhouette_score.rb
210
+ - lib/rumale/feature_extraction/feature_hasher.rb
211
+ - lib/rumale/feature_extraction/hash_vectorizer.rb
210
212
  - lib/rumale/kernel_approximation/rbf.rb
211
213
  - lib/rumale/kernel_machine/kernel_pca.rb
212
214
  - lib/rumale/kernel_machine/kernel_ridge.rb