rumale 0.14.5 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +5 -0
- data/lib/rumale.rb +2 -0
- data/lib/rumale/feature_extraction/feature_hasher.rb +149 -0
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +154 -0
- data/lib/rumale/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d8823e97350be198c39b1896dc88978a47b526f1
|
4
|
+
data.tar.gz: d65a2d3274d104eae9aa20dd97078159614783e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f06921658636e7765edc7a71aa9df28bf6a5cd4b36706671b2bf3a75c55755d190e8ab4dc9801cf94a4034b04429a6f14d8978c0323a9a2a6bf5d34456aa2e5
|
7
|
+
data.tar.gz: 74dfe7a75358e9e26da392dae39e71c1441a56c63ad51236db3bedfda4156f8cd86a1b69c40574e34b2db46fb8bdb9f22da60bde0a9423afa6849b11551a9494
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# 0.15.0
|
2
|
+
- Add feature extractor classes:
|
3
|
+
- [HashVectorizer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/HashVectorizer.html)
|
4
|
+
- [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html)
|
5
|
+
|
1
6
|
# 0.14.5
|
2
7
|
- Fix to suppress deprecation warning about keyword argument in Ruby 2.7.
|
3
8
|
|
data/lib/rumale.rb
CHANGED
@@ -77,6 +77,8 @@ require 'rumale/manifold/mds'
|
|
77
77
|
require 'rumale/neural_network/base_mlp'
|
78
78
|
require 'rumale/neural_network/mlp_regressor'
|
79
79
|
require 'rumale/neural_network/mlp_classifier'
|
80
|
+
require 'rumale/feature_extraction/hash_vectorizer'
|
81
|
+
require 'rumale/feature_extraction/feature_hasher'
|
80
82
|
require 'rumale/preprocessing/l2_normalizer'
|
81
83
|
require 'rumale/preprocessing/min_max_scaler'
|
82
84
|
require 'rumale/preprocessing/max_abs_scaler'
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module FeatureExtraction
|
8
|
+
# Encode array of feature-value hash to vectors with feature hashing (hashing trick).
|
9
|
+
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
|
10
|
+
# This encoder employs signed 32-bit Murmurhash3 as the hash function.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
|
14
|
+
# x = encoder.transform([
|
15
|
+
# { dog: 1, cat: 2, elephant: 4 },
|
16
|
+
# { dog: 2, run: 5 }
|
17
|
+
# ])
|
18
|
+
# # > pp x
|
19
|
+
# # Numo::DFloat#shape=[2,10]
|
20
|
+
# # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
|
21
|
+
# # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
|
22
|
+
class FeatureHasher
|
23
|
+
include Base::BaseEstimator
|
24
|
+
include Base::Transformer
|
25
|
+
|
26
|
+
# Create a new encoder for converting array of hash consisting of feature names and values to vectors
|
27
|
+
# with feature hashing algorithm.
|
28
|
+
#
|
29
|
+
# @param n_features [Integer] The number of features of encoded samples.
|
30
|
+
# @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
|
31
|
+
def initialize(n_features: 1024, alternate_sign: true)
|
32
|
+
check_params_numeric(n_features: n_features)
|
33
|
+
check_params_boolean(alternate_sign: alternate_sign)
|
34
|
+
@params = {}
|
35
|
+
@params[:n_features] = n_features
|
36
|
+
@params[:alternate_sign] = alternate_sign
|
37
|
+
end
|
38
|
+
|
39
|
+
# This method does not do anything. The encoder does not require training.
|
40
|
+
#
|
41
|
+
# @overload fit(x) -> FeatureHasher
|
42
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
43
|
+
# @return [FeatureHasher]
|
44
|
+
def fit(_x = nil, _y = nil)
|
45
|
+
self
|
46
|
+
end
|
47
|
+
|
48
|
+
# Encode given the array of feature-value hash.
|
49
|
+
# This method has the same output as the transform method
|
50
|
+
# because the encoder does not require training.
|
51
|
+
#
|
52
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
53
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
54
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
55
|
+
def fit_transform(x, _y = nil)
|
56
|
+
fit(x).transform(x)
|
57
|
+
end
|
58
|
+
|
59
|
+
# Encode given the array of feature-value hash.
|
60
|
+
#
|
61
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
62
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
63
|
+
def transform(x)
  # A single Hash is accepted as a one-sample batch.
  x = [x] unless x.is_a?(Array)
  n_samples = x.size

  z = Numo::DFloat.zeros(n_samples, n_features)

  x.each_with_index do |f, i|
    f.each do |k, v|
      # A String value is treated as categorical: the value is folded into
      # the hashed key ("name=value") and contributes 1 to the column.
      k = "#{k}=#{v}" if v.is_a?(String)
      val = v.is_a?(String) ? 1 : v
      next if val.zero?

      h = murmur_hash(k.to_s)
      fid = h.abs % n_features
      # With alternate_sign enabled, the sign of the hash decides the sign of
      # the contribution.
      val = -val if alternate_sign? && h.negative?
      # Accumulate instead of assigning: distinct features of one sample may
      # hash to the same column, and a plain assignment would silently drop
      # every contribution but the last.
      z[i, fid] += val
    end
  end

  z
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def n_features
|
88
|
+
@params[:n_features]
|
89
|
+
end
|
90
|
+
|
91
|
+
def alternate_sign?
|
92
|
+
@params[:alternate_sign]
|
93
|
+
end
|
94
|
+
|
95
|
+
# MurmurHash3_32
|
96
|
+
# References:
|
97
|
+
# - https://en.wikipedia.org/wiki/MurmurHash
|
98
|
+
# - https://github.com/aappleby/smhasher
|
99
|
+
# Compute the 32-bit MurmurHash3 of a string and return it as a signed integer.
#
# @param key_str [String] The string to hash; its bytes are consumed little-endian, four at a time.
# @param seed [Integer] The hash seed. Only its low 32 bits are used.
# @return [Integer] The hash value as a signed 32-bit integer (-2**31...2**31).
def murmur_hash(key_str, seed = 0)
  keyb = key_str.bytes
  key_len = keyb.size
  n_blocks = key_len / 4

  # Reduce the seed to an unsigned 32-bit value so that negative or over-wide
  # seeds behave like their 32-bit two's-complement equivalent.
  h = seed & 0xFFFFFFFF
  (0...n_blocks * 4).step(4) do |bstart|
    # Assemble one little-endian 32-bit block.
    k = keyb[bstart + 3] << 24 | keyb[bstart + 2] << 16 | keyb[bstart + 1] << 8 | keyb[bstart + 0]
    h ^= murmur_scramble(k)
    h = murmur_rotl(h, 13)
    h = (h * 5 + 0xe6546b64) & 0xFFFFFFFF
  end

  # Mix in the remaining 1-3 bytes that do not fill a whole block.
  tail_id = n_blocks * 4
  tail_sz = key_len & 3

  k = 0
  k ^= keyb[tail_id + 2] << 16 if tail_sz >= 3
  k ^= keyb[tail_id + 1] << 8 if tail_sz >= 2
  k ^= keyb[tail_id + 0] if tail_sz >= 1
  h ^= murmur_scramble(k) if tail_sz.positive?

  h = murmur_fmix(h ^ key_len)

  # Reinterpret the unsigned 32-bit result as a signed two's-complement value.
  if (h & 0x80000000).zero?
    h
  else
    -((h ^ 0xFFFFFFFF) + 1)
  end
end

# Rotate the 32-bit value x left by r bits. Expects x to already fit in 32 bits.
def murmur_rotl(x, r)
  (x << r | x >> (32 - r)) & 0xFFFFFFFF
end

# Per-block mixing step: multiply, rotate, multiply, each truncated to 32 bits.
def murmur_scramble(k)
  k = (k * 0xcc9e2d51) & 0xFFFFFFFF
  k = murmur_rotl(k, 15)
  (k * 0x1b873593) & 0xFFFFFFFF
end

# Final mixing step (xor-shift and multiply rounds) applied to the accumulated hash.
def murmur_fmix(h)
  h ^= h >> 16
  h = (h * 0x85ebca6b) & 0xFFFFFFFF
  h ^= h >> 13
  h = (h * 0xc2b2ae35) & 0xFFFFFFFF
  h ^ (h >> 16)
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
# This module consists of the classes that extract features from raw data.
|
8
|
+
module FeatureExtraction
|
9
|
+
# Encode array of feature-value hash to vectors.
|
10
|
+
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# encoder = Rumale::FeatureExtraction::HashVectorizer.new
|
14
|
+
# x = encoder.fit_transform([
|
15
|
+
# { foo: 1, bar: 2 },
|
16
|
+
# { foo: 3, baz: 1 }
|
17
|
+
# ])
|
18
|
+
# # > pp x
|
19
|
+
# # Numo::DFloat#shape=[2,3]
|
20
|
+
# # [[2, 0, 1],
|
21
|
+
# # [0, 1, 3]]
|
22
|
+
#
|
23
|
+
# x = encoder.fit_transform([
|
24
|
+
# { city: 'Dubai', temperature: 33 },
|
25
|
+
# { city: 'London', temperature: 12 },
|
26
|
+
# { city: 'San Francisco', temperature: 18 }
|
27
|
+
# ])
|
28
|
+
# # > pp x
|
29
|
+
# # Numo::DFloat#shape=[3,4]
|
30
|
+
# # [[1, 0, 0, 33],
|
31
|
+
# # [0, 1, 0, 12],
|
32
|
+
# # [0, 0, 1, 18]]
|
33
|
+
# # > pp encoder.inverse_transform(x)
|
34
|
+
# # [{:city=>"Dubai", :temperature=>33.0},
|
35
|
+
# # {:city=>"London", :temperature=>12.0},
|
36
|
+
# # {:city=>"San Francisco", :temperature=>18.0}]
|
37
|
+
class HashVectorizer
|
38
|
+
include Base::BaseEstimator
|
39
|
+
include Base::Transformer
|
40
|
+
|
41
|
+
# Return the list of feature names.
|
42
|
+
# @return [Array] (size: [n_features])
|
43
|
+
attr_reader :feature_names
|
44
|
+
|
45
|
+
# Return the hash consisting of pairs of feature names and indices.
|
46
|
+
# @return [Hash] (size: [n_features])
|
47
|
+
attr_reader :vocabulary
|
48
|
+
|
49
|
+
# Create a new encoder for converting array of hash consisting of feature names and values to vectors.
|
50
|
+
#
|
51
|
+
# @param separator [String] The separator string used for constructing new feature names for categorical feature.
|
52
|
+
# @param sort [Boolean] The flag indicating whether to sort feature names.
|
53
|
+
def initialize(separator: '=', sort: true)
|
54
|
+
check_params_string(separator: separator)
|
55
|
+
check_params_boolean(sort: sort)
|
56
|
+
@params = {}
|
57
|
+
@params[:separator] = separator
|
58
|
+
@params[:sort] = sort
|
59
|
+
end
|
60
|
+
|
61
|
+
# Fit the encoder with given training data.
|
62
|
+
#
|
63
|
+
# @overload fit(x) -> HashVectorizer
|
64
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
65
|
+
# @return [HashVectorizer]
|
66
|
+
def fit(x, _y = nil)
|
67
|
+
@feature_names = []
|
68
|
+
@vocabulary = {}
|
69
|
+
|
70
|
+
x.each do |f|
|
71
|
+
f.each do |k, v|
|
72
|
+
k = "#{k}#{separator}#{v}".to_sym if v.is_a?(String)
|
73
|
+
next if @vocabulary.key?(k)
|
74
|
+
@feature_names.push(k)
|
75
|
+
@vocabulary[k] = @vocabulary.size
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
if sort_feature?
|
80
|
+
@feature_names.sort!
|
81
|
+
@feature_names.each_with_index { |k, i| @vocabulary[k] = i }
|
82
|
+
end
|
83
|
+
|
84
|
+
self
|
85
|
+
end
|
86
|
+
|
87
|
+
# Fit the encoder with given training data, then return encoded data.
|
88
|
+
#
|
89
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
90
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
91
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
92
|
+
def fit_transform(x, _y = nil)
|
93
|
+
fit(x).transform(x)
|
94
|
+
end
|
95
|
+
|
96
|
+
# Encode given the array of feature-value hash.
|
97
|
+
#
|
98
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
99
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
100
|
+
def transform(x)
|
101
|
+
x = [x] unless x.is_a?(Array)
|
102
|
+
n_samples = x.size
|
103
|
+
n_features = @vocabulary.size
|
104
|
+
z = Numo::DFloat.zeros(n_samples, n_features)
|
105
|
+
|
106
|
+
x.each_with_index do |f, i|
|
107
|
+
f.each do |k, v|
|
108
|
+
if v.is_a?(String)
|
109
|
+
k = "#{k}#{separator}#{v}".to_sym
|
110
|
+
v = 1
|
111
|
+
end
|
112
|
+
z[i, @vocabulary[k]] = v if @vocabulary.key?(k)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
z
|
117
|
+
end
|
118
|
+
|
119
|
+
# Decode sample matrix to the array of feature-value hash.
|
120
|
+
#
|
121
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
122
|
+
# @return [Array<Hash>] The array of hash consisting of feature names and values.
|
123
|
+
def inverse_transform(x)
|
124
|
+
n_samples = x.shape[0]
|
125
|
+
reconst = []
|
126
|
+
|
127
|
+
n_samples.times do |i|
|
128
|
+
f = {}
|
129
|
+
x[i, true].each_with_index do |el, j|
|
130
|
+
feature_key_val(@feature_names[j], el).tap { |k, v| f[k.to_sym] = v } unless el.zero?
|
131
|
+
end
|
132
|
+
reconst.push(f)
|
133
|
+
end
|
134
|
+
|
135
|
+
reconst
|
136
|
+
end
|
137
|
+
|
138
|
+
private
|
139
|
+
|
140
|
+
# Recover a key/value pair from an encoded feature name and its matrix value.
#
# When the name contains the separator it is decoded as a categorical feature
# and the pair [name, category] (both Strings) is returned; otherwise the
# feature name and the numeric value are returned unchanged.
#
# @param fname [Symbol, String] The feature name stored in the vocabulary.
# @param fval [Numeric] The value read from the encoded sample matrix.
# @return [Array] A two-element array of key and value.
def feature_key_val(fname, fval)
  # Split on the first separator only, so a category that itself contains
  # the separator (e.g. "note=a=b") or is empty (e.g. "city=") is kept whole
  # instead of falling through to the numeric branch.
  name, category = fname.to_s.split(separator, 2)
  category.nil? ? [fname, fval] : [name, category]
end
|
144
|
+
|
145
|
+
def separator
|
146
|
+
@params[:separator]
|
147
|
+
end
|
148
|
+
|
149
|
+
def sort_feature?
|
150
|
+
@params[:sort]
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.5
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -207,6 +207,8 @@ files:
|
|
207
207
|
- lib/rumale/evaluation_measure/recall.rb
|
208
208
|
- lib/rumale/evaluation_measure/roc_auc.rb
|
209
209
|
- lib/rumale/evaluation_measure/silhouette_score.rb
|
210
|
+
- lib/rumale/feature_extraction/feature_hasher.rb
|
211
|
+
- lib/rumale/feature_extraction/hash_vectorizer.rb
|
210
212
|
- lib/rumale/kernel_approximation/rbf.rb
|
211
213
|
- lib/rumale/kernel_machine/kernel_pca.rb
|
212
214
|
- lib/rumale/kernel_machine/kernel_ridge.rb
|