rumale 0.14.5 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 53beac1d67735cd6f03714c82bbd149379df845f
4
- data.tar.gz: 303720c35bc1b14b720379abb6464812778398a2
3
+ metadata.gz: d8823e97350be198c39b1896dc88978a47b526f1
4
+ data.tar.gz: d65a2d3274d104eae9aa20dd97078159614783e1
5
5
  SHA512:
6
- metadata.gz: 44e7893da77fcb9bcc245d096b69553b87bb740c160f879f7a1d8f00d30f1fc39f59831b17db6ff267772634cf356bceac5c5aa9f91792f0a6998938c60e2cb3
7
- data.tar.gz: 21b3c761487805a31fc61d3570582892d02b97c3c14c01bf0813dd9893b79797a9092f1a9c8ecc47434fbff079a495897a8bf81109e026ac0c2a212615df7831
6
+ metadata.gz: 5f06921658636e7765edc7a71aa9df28bf6a5cd4b36706671b2bf3a75c55755d190e8ab4dc9801cf94a4034b04429a6f14d8978c0323a9a2a6bf5d34456aa2e5
7
+ data.tar.gz: 74dfe7a75358e9e26da392dae39e71c1441a56c63ad51236db3bedfda4156f8cd86a1b69c40574e34b2db46fb8bdb9f22da60bde0a9423afa6849b11551a9494
@@ -6,6 +6,7 @@ rvm:
6
6
  - '2.4'
7
7
  - '2.5'
8
8
  - '2.6'
9
+ - '2.7'
9
10
 
10
11
  addons:
11
12
  apt:
@@ -1,3 +1,8 @@
1
+ # 0.15.0
2
+ - Add feature extractor classes:
3
+ - [HashVectorizer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/HashVectorizer.html)
4
+ - [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html)
5
+
1
6
  # 0.14.5
2
7
  - Fix to suppress deprecation warning about keyword argument in Ruby 2.7.
3
8
 
@@ -77,6 +77,8 @@ require 'rumale/manifold/mds'
77
77
  require 'rumale/neural_network/base_mlp'
78
78
  require 'rumale/neural_network/mlp_regressor'
79
79
  require 'rumale/neural_network/mlp_classifier'
80
+ require 'rumale/feature_extraction/hash_vectorizer'
81
+ require 'rumale/feature_extraction/feature_hasher'
80
82
  require 'rumale/preprocessing/l2_normalizer'
81
83
  require 'rumale/preprocessing/min_max_scaler'
82
84
  require 'rumale/preprocessing/max_abs_scaler'
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module FeatureExtraction
8
+ # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
9
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
10
+ # This encoder employs the signed 32-bit MurmurHash3 as the hash function.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
14
+ # x = encoder.transform([
15
+ # { dog: 1, cat: 2, elephant: 4 },
16
+ # { dog: 2, run: 5 }
17
+ # ])
18
+ # # > pp x
19
+ # # Numo::DFloat#shape=[2,10]
20
+ # # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
21
+ # # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
22
+ class FeatureHasher
23
+ include Base::BaseEstimator
24
+ include Base::Transformer
25
+
26
# Create a new encoder that converts an array of hashes (feature name => value)
# into vectors with the feature hashing algorithm.
#
# @param n_features [Integer] The number of features of encoded samples.
# @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
def initialize(n_features: 1024, alternate_sign: true)
  check_params_numeric(n_features: n_features)
  check_params_boolean(alternate_sign: alternate_sign)
  # Validated hyper-parameters are kept together in a single hash.
  @params = { n_features: n_features, alternate_sign: alternate_sign }
end
38
+
39
# Do nothing and return the receiver. The encoder does not require training,
# so both arguments are ignored.
#
# @overload fit(x) -> FeatureHasher
#   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
# @return [FeatureHasher]
def fit(_x = nil, _y = nil)
  self
end
47
+
48
# Encode the given array of feature-value hashes.
# The result is the same as calling the transform method directly,
# because fitting this encoder does not learn anything.
#
# @overload fit_transform(x) -> Numo::DFloat
#   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
def fit_transform(x, _y = nil)
  fitted = fit(x)
  fitted.transform(x)
end
58
+
59
# Encode the given array of feature-value hashes with the hashing trick.
#
# Each feature name is hashed to a column index. A String value is treated as a
# categorical feature: the name becomes "name=value" and the weight becomes 1.
# Features whose value is zero are skipped. Features that hash to the same column
# are summed, so no feature is silently dropped on a collision.
#
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
#   A single Hash is treated as one sample.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
def transform(x)
  x = [x] unless x.is_a?(Array)
  z = Numo::DFloat.zeros(x.size, n_features)

  x.each_with_index do |sample, i|
    sample.each do |name, value|
      if value.is_a?(String)
        name = "#{name}=#{value}"
        value = 1
      end
      next if value.zero?

      h = murmur_hash(name.to_s)
      # The sign of the hash decides the sign of the contribution, so that
      # colliding features tend to cancel out instead of piling up.
      value = -value if alternate_sign? && h.negative?
      # Accumulate instead of assigning: with plain assignment the last colliding
      # feature would overwrite the earlier ones.
      z[i, h.abs % n_features] += value
    end
  end

  z
end
84
+
85
+ private
86
+
87
# Read the :n_features entry of the parameter hash.
def n_features
  @params[:n_features]
end
90
+
91
# Read the :alternate_sign entry of the parameter hash.
def alternate_sign?
  @params[:alternate_sign]
end
94
+
95
# Compute MurmurHash3_32 of a string and return it as a signed 32-bit integer.
# References:
# - https://en.wikipedia.org/wiki/MurmurHash
# - https://github.com/aappleby/smhasher
#
# @param key_str [String] The string to hash; its raw bytes are used.
# @param seed [Integer] The hash seed. Only the low 32 bits are used.
# @return [Integer] The hash value in the range -2**31...2**31.
def murmur_hash(key_str, seed = 0)
  keyb = key_str.bytes
  key_len = keyb.size

  # The algorithm works on unsigned 32-bit words, so reduce the seed first.
  # A negative or oversized seed would otherwise corrupt the shifts below.
  h = seed & 0xFFFFFFFF

  # Body: 'V*' decodes each complete 4-byte block as a little-endian word
  # and ignores the trailing 0-3 bytes.
  key_str.unpack('V*').each do |k|
    h ^= murmur_scramble(k)
    h = murmur_rotl(h, 13)
    h = (h * 5 + 0xe6546b64) & 0xFFFFFFFF
  end

  # Tail: the remaining bytes form one partial little-endian word.
  tail = keyb.last(key_len & 3)
  unless tail.empty?
    k = tail.each_with_index.reduce(0) { |acc, (byte, pos)| acc | (byte << (8 * pos)) }
    h ^= murmur_scramble(k)
  end

  h = murmur_fmix(h ^ key_len)

  # Reinterpret the unsigned result as a two's-complement signed value.
  h >= 0x80000000 ? h - 0x100000000 : h
end
129
+
130
# Rotate x left by r bits and keep the low 32 bits of the result.
def murmur_rotl(x, r)
  shifted_up = x << r
  wrapped = x >> (32 - r)
  (shifted_up | wrapped) & 0xFFFFFFFF
end
133
+
134
# Mix one input word: multiply, rotate left by 15 bits, multiply again,
# keeping the low 32 bits after each multiplication.
def murmur_scramble(k)
  first = (k * 0xcc9e2d51) & 0xFFFFFFFF
  rotated = murmur_rotl(first, 15)
  (rotated * 0x1b873593) & 0xFFFFFFFF
end
139
+
140
# Final mixing step: alternate xor-shifts (16, 13, 16 bits) with two
# multiplications, keeping the low 32 bits after each multiplication.
def murmur_fmix(h)
  mixed = h ^ (h >> 16)
  mixed = (mixed * 0x85ebca6b) & 0xFFFFFFFF
  mixed ^= mixed >> 13
  mixed = (mixed * 0xc2b2ae35) & 0xFFFFFFFF
  mixed ^ (mixed >> 16)
end
147
+ end
148
+ end
149
+ end
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ # This module consists of the classes that extract features from raw data.
8
+ module FeatureExtraction
9
+ # Encode array of feature-value hash to vectors.
10
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ # # > pp x
19
+ # # Numo::DFloat#shape=[2,3]
20
+ # # [[2, 0, 1],
21
+ # # [0, 1, 3]]
22
+ #
23
+ # x = encoder.fit_transform([
24
+ # { city: 'Dubai', temperature: 33 },
25
+ # { city: 'London', temperature: 12 },
26
+ # { city: 'San Francisco', temperature: 18 }
27
+ # ])
28
+ # # > pp x
29
+ # # Numo::DFloat#shape=[3,4]
30
+ # # [[1, 0, 0, 33],
31
+ # # [0, 1, 0, 12],
32
+ # # [0, 0, 1, 18]]
33
+ # # > pp encoder.inverse_transform(x)
34
+ # # [{:city=>"Dubai", :temperature=>33.0},
35
+ # # {:city=>"London", :temperature=>12.0},
36
+ # # {:city=>"San Francisco", :temperature=>18.0}]
37
+ class HashVectorizer
38
+ include Base::BaseEstimator
39
+ include Base::Transformer
40
+
41
+ # Return the list of feature names.
42
+ # @return [Array] (size: [n_features])
43
+ attr_reader :feature_names
44
+
45
+ # Return the hash consisting of pairs of feature names and indices.
46
+ # @return [Hash] (size: [n_features])
47
+ attr_reader :vocabulary
48
+
49
# Create a new encoder that converts an array of hashes (feature name => value) into vectors.
#
# @param separator [String] The separator string used for constructing new feature names for categorical feature.
# @param sort [Boolean] The flag indicating whether to sort feature names.
def initialize(separator: '=', sort: true)
  check_params_string(separator: separator)
  check_params_boolean(sort: sort)
  # Validated hyper-parameters are kept together in a single hash.
  @params = { separator: separator, sort: sort }
end
60
+
61
# Fit the encoder with given training data.
#
# Collects every distinct feature name in order of first appearance. A String value
# is treated as a categorical feature, whose name becomes "name<separator>value"
# (as a Symbol). The names are sorted when the sort option is enabled, and each
# name is then mapped to its column index.
#
# @overload fit(x) -> HashVectorizer
#   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
#     A single Hash is treated as one sample, as in the transform method.
# @return [HashVectorizer]
def fit(x, _y = nil)
  # Wrap a lone Hash; iterating it directly would yield key/value pairs as if they were samples.
  x = [x] unless x.is_a?(Array)

  names = x.flat_map do |sample|
    sample.map do |name, value|
      value.is_a?(String) ? "#{name}#{separator}#{value}".to_sym : name
    end
  end
  names.uniq!
  names.sort! if sort_feature?

  @feature_names = names
  # Build the name => index table once, after the final order is known.
  @vocabulary = names.each_with_index.to_h

  self
end
86
+
87
# Fit the encoder with the given training data, then encode that same data.
#
# @overload fit_transform(x) -> Numo::DFloat
#   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
def fit_transform(x, _y = nil)
  fitted = fit(x)
  fitted.transform(x)
end
95
+
96
# Encode the given array of feature-value hashes using the fitted vocabulary.
# A String value is encoded as 1 in the column of its "name<separator>value" feature.
# Features that are not in the vocabulary are ignored.
#
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
def transform(x)
  samples = x.is_a?(Array) ? x : [x]
  z = Numo::DFloat.zeros(samples.size, @vocabulary.size)

  samples.each_with_index do |sample, row|
    sample.each do |name, value|
      categorical = value.is_a?(String)
      key = categorical ? "#{name}#{separator}#{value}".to_sym : name
      next unless @vocabulary.key?(key)

      z[row, @vocabulary[key]] = categorical ? 1 : value
    end
  end

  z
end
118
+
119
# Decode a sample matrix into an array of feature-value hashes.
# Zero elements are omitted from the reconstructed hashes.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
# @return [Array<Hash>] The array of hash consisting of feature names and values.
def inverse_transform(x)
  x.shape[0].times.map do |i|
    sample = {}
    x[i, true].each_with_index do |el, j|
      next if el.zero?

      key, val = feature_key_val(@feature_names[j], el)
      sample[key.to_sym] = val
    end
    sample
  end
end
137
+
138
+ private
139
+
140
# Recover the original [key, value] pair from an encoded feature name.
#
# A name that contains the separator is taken to be a categorical feature
# "key<separator>value" and is split at the FIRST separator. The value may
# therefore be empty or contain the separator itself. Any other name is a
# numerical feature and is returned together with the given value.
# NOTE: a numerical feature whose own name contains the separator cannot be
# told apart from a categorical one here.
#
# @param fname [Symbol, String] The encoded feature name.
# @param fval [Numeric] The encoded feature value.
# @return [Array] The pair of key and value.
def feature_key_val(fname, fval)
  # partition matches the separator literally, whereas split(' ') would split on any whitespace run.
  key, sep, label = fname.to_s.partition(separator)
  sep.empty? ? [fname, fval] : [key, label]
end
144
+
145
# Read the :separator entry of the parameter hash.
def separator
  @params[:separator]
end
148
+
149
# Read the :sort entry of the parameter hash.
def sort_feature?
  @params[:sort]
end
152
+ end
153
+ end
154
+ end
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.14.5'
6
+ VERSION = '0.15.0'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.5
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-28 00:00:00.000000000 Z
11
+ date: 2020-01-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -207,6 +207,8 @@ files:
207
207
  - lib/rumale/evaluation_measure/recall.rb
208
208
  - lib/rumale/evaluation_measure/roc_auc.rb
209
209
  - lib/rumale/evaluation_measure/silhouette_score.rb
210
+ - lib/rumale/feature_extraction/feature_hasher.rb
211
+ - lib/rumale/feature_extraction/hash_vectorizer.rb
210
212
  - lib/rumale/kernel_approximation/rbf.rb
211
213
  - lib/rumale/kernel_machine/kernel_pca.rb
212
214
  - lib/rumale/kernel_machine/kernel_ridge.rb