rumale 0.14.5 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +5 -0
- data/lib/rumale.rb +2 -0
- data/lib/rumale/feature_extraction/feature_hasher.rb +149 -0
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +154 -0
- data/lib/rumale/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d8823e97350be198c39b1896dc88978a47b526f1
|
4
|
+
data.tar.gz: d65a2d3274d104eae9aa20dd97078159614783e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f06921658636e7765edc7a71aa9df28bf6a5cd4b36706671b2bf3a75c55755d190e8ab4dc9801cf94a4034b04429a6f14d8978c0323a9a2a6bf5d34456aa2e5
|
7
|
+
data.tar.gz: 74dfe7a75358e9e26da392dae39e71c1441a56c63ad51236db3bedfda4156f8cd86a1b69c40574e34b2db46fb8bdb9f22da60bde0a9423afa6849b11551a9494
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# 0.15.0
|
2
|
+
- Add feature extractor classes:
|
3
|
+
- [HashVectorizer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/HashVectorizer.html)
|
4
|
+
- [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html)
|
5
|
+
|
1
6
|
# 0.14.5
|
2
7
|
- Fix to suppress deprecation warning about keyword argument in Ruby 2.7.
|
3
8
|
|
data/lib/rumale.rb
CHANGED
@@ -77,6 +77,8 @@ require 'rumale/manifold/mds'
|
|
77
77
|
require 'rumale/neural_network/base_mlp'
|
78
78
|
require 'rumale/neural_network/mlp_regressor'
|
79
79
|
require 'rumale/neural_network/mlp_classifier'
|
80
|
+
require 'rumale/feature_extraction/hash_vectorizer'
|
81
|
+
require 'rumale/feature_extraction/feature_hasher'
|
80
82
|
require 'rumale/preprocessing/l2_normalizer'
|
81
83
|
require 'rumale/preprocessing/min_max_scaler'
|
82
84
|
require 'rumale/preprocessing/max_abs_scaler'
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module FeatureExtraction
|
8
|
+
# Encode array of feature-value hash to vectors with feature hashing (hashing trick).
|
9
|
+
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
|
10
|
+
# This encoder employs signed 32-bit Murmurhash3 as the hash function.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
|
14
|
+
# x = encoder.transform([
|
15
|
+
# { dog: 1, cat: 2, elephant: 4 },
|
16
|
+
# { dog: 2, run: 5 }
|
17
|
+
# ])
|
18
|
+
# # > pp x
|
19
|
+
# # Numo::DFloat#shape=[2,10]
|
20
|
+
# # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
|
21
|
+
# # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
|
22
|
+
class FeatureHasher
|
23
|
+
include Base::BaseEstimator
|
24
|
+
include Base::Transformer
|
25
|
+
|
26
|
+
# Create a new encoder for converting array of hash consisting of feature names and values to vectors
|
27
|
+
# with feature hashing algorithm.
|
28
|
+
#
|
29
|
+
# @param n_features [Integer] The number of features of encoded samples.
|
30
|
+
# @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
|
31
|
+
def initialize(n_features: 1024, alternate_sign: true)
|
32
|
+
check_params_numeric(n_features: n_features)
|
33
|
+
check_params_boolean(alternate_sign: alternate_sign)
|
34
|
+
@params = {}
|
35
|
+
@params[:n_features] = n_features
|
36
|
+
@params[:alternate_sign] = alternate_sign
|
37
|
+
end
|
38
|
+
|
39
|
+
# This method does not do anything. The encoder does not require training.
|
40
|
+
#
|
41
|
+
# @overload fit(x) -> FeatureHasher
|
42
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
43
|
+
# @return [FeatureHasher]
|
44
|
+
def fit(_x = nil, _y = nil)
|
45
|
+
self
|
46
|
+
end
|
47
|
+
|
48
|
+
# Encode the given array of feature-value hashes.
|
49
|
+
# This method has the same output as the transform method
|
50
|
+
# because the encoder does not require training.
|
51
|
+
#
|
52
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
53
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
54
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
55
|
+
def fit_transform(x, _y = nil)
|
56
|
+
fit(x).transform(x)
|
57
|
+
end
|
58
|
+
|
59
|
+
# Encode the given array of feature-value hashes.
|
60
|
+
#
|
61
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
62
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
63
|
+
def transform(x)
|
64
|
+
x = [x] unless x.is_a?(Array)
|
65
|
+
n_samples = x.size
|
66
|
+
|
67
|
+
z = Numo::DFloat.zeros(n_samples, n_features)
|
68
|
+
|
69
|
+
x.each_with_index do |f, i|
|
70
|
+
f.each do |k, v|
|
71
|
+
k = "#{k}=#{v}" if v.is_a?(String)
|
72
|
+
val = v.is_a?(String) ? 1 : v
|
73
|
+
next if val.zero?
|
74
|
+
|
75
|
+
h = murmur_hash(k.to_s)
|
76
|
+
fid = h.abs % n_features
|
77
|
+
val *= h >= 0 ? 1 : -1 if alternate_sign?
|
78
|
+
z[i, fid] = val
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
z
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def n_features
|
88
|
+
@params[:n_features]
|
89
|
+
end
|
90
|
+
|
91
|
+
def alternate_sign?
|
92
|
+
@params[:alternate_sign]
|
93
|
+
end
|
94
|
+
|
95
|
+
# MurmurHash3_32
|
96
|
+
# References:
|
97
|
+
# - https://en.wikipedia.org/wiki/MurmurHash
|
98
|
+
# - https://github.com/aappleby/smhasher
|
99
|
+
def murmur_hash(key_str, seed = 0)
|
100
|
+
keyb = key_str.bytes
|
101
|
+
key_len = keyb.size
|
102
|
+
n_blocks = key_len / 4
|
103
|
+
|
104
|
+
h = seed
|
105
|
+
(0...n_blocks * 4).step(4) do |bstart|
|
106
|
+
k = keyb[bstart + 3] << 24 | keyb[bstart + 2] << 16 | keyb[bstart + 1] << 8 | keyb[bstart + 0]
|
107
|
+
h ^= murmur_scramble(k)
|
108
|
+
h = murmur_rotl(h, 13)
|
109
|
+
h = (h * 5 + 0xe6546b64) & 0xFFFFFFFF
|
110
|
+
end
|
111
|
+
|
112
|
+
tail_id = n_blocks * 4
|
113
|
+
tail_sz = key_len & 3
|
114
|
+
|
115
|
+
k = 0
|
116
|
+
k ^= keyb[tail_id + 2] << 16 if tail_sz >= 3
|
117
|
+
k ^= keyb[tail_id + 1] << 8 if tail_sz >= 2
|
118
|
+
k ^= keyb[tail_id + 0] if tail_sz >= 1
|
119
|
+
h ^= murmur_scramble(k) if tail_sz.positive?
|
120
|
+
|
121
|
+
h = murmur_fmix(h ^ key_len)
|
122
|
+
|
123
|
+
if (h & 0x80000000).zero?
|
124
|
+
h
|
125
|
+
else
|
126
|
+
-((h ^ 0xFFFFFFFF) + 1)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def murmur_rotl(x, r)
|
131
|
+
(x << r | x >> (32 - r)) & 0xFFFFFFFF
|
132
|
+
end
|
133
|
+
|
134
|
+
def murmur_scramble(k)
|
135
|
+
k = (k * 0xcc9e2d51) & 0xFFFFFFFF
|
136
|
+
k = murmur_rotl(k, 15)
|
137
|
+
(k * 0x1b873593) & 0xFFFFFFFF
|
138
|
+
end
|
139
|
+
|
140
|
+
def murmur_fmix(h)
|
141
|
+
h ^= h >> 16
|
142
|
+
h = (h * 0x85ebca6b) & 0xFFFFFFFF
|
143
|
+
h ^= h >> 13
|
144
|
+
h = (h * 0xc2b2ae35) & 0xFFFFFFFF
|
145
|
+
h ^ (h >> 16)
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
# This module consists of the classes that extract features from raw data.
|
8
|
+
module FeatureExtraction
|
9
|
+
# Encode array of feature-value hash to vectors.
|
10
|
+
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# encoder = Rumale::FeatureExtraction::HashVectorizer.new
|
14
|
+
# x = encoder.fit_transform([
|
15
|
+
# { foo: 1, bar: 2 },
|
16
|
+
# { foo: 3, baz: 1 }
|
17
|
+
# ])
|
18
|
+
# # > pp x
|
19
|
+
# # Numo::DFloat#shape=[2,3]
|
20
|
+
# # [[2, 0, 1],
|
21
|
+
# # [0, 1, 3]]
|
22
|
+
#
|
23
|
+
# x = encoder.fit_transform([
|
24
|
+
# { city: 'Dubai', temperature: 33 },
|
25
|
+
# { city: 'London', temperature: 12 },
|
26
|
+
# { city: 'San Francisco', temperature: 18 }
|
27
|
+
# ])
|
28
|
+
# # > pp x
|
29
|
+
# # Numo::DFloat#shape=[3,4]
|
30
|
+
# # [[1, 0, 0, 33],
|
31
|
+
# # [0, 1, 0, 12],
|
32
|
+
# # [0, 0, 1, 18]]
|
33
|
+
# # > pp encoder.inverse_transform(x)
|
34
|
+
# # [{:city=>"Dubai", :temperature=>33.0},
|
35
|
+
# # {:city=>"London", :temperature=>12.0},
|
36
|
+
# # {:city=>"San Francisco", :temperature=>18.0}]
|
37
|
+
class HashVectorizer
|
38
|
+
include Base::BaseEstimator
|
39
|
+
include Base::Transformer
|
40
|
+
|
41
|
+
# Return the list of feature names.
|
42
|
+
# @return [Array] (size: [n_features])
|
43
|
+
attr_reader :feature_names
|
44
|
+
|
45
|
+
# Return the hash consisting of pairs of feature names and indices.
|
46
|
+
# @return [Hash] (size: [n_features])
|
47
|
+
attr_reader :vocabulary
|
48
|
+
|
49
|
+
# Create a new encoder for converting array of hash consisting of feature names and values to vectors.
|
50
|
+
#
|
51
|
+
# @param separator [String] The separator string used for constructing new feature names for categorical feature.
|
52
|
+
# @param sort [Boolean] The flag indicating whether to sort feature names.
|
53
|
+
def initialize(separator: '=', sort: true)
|
54
|
+
check_params_string(separator: separator)
|
55
|
+
check_params_boolean(sort: sort)
|
56
|
+
@params = {}
|
57
|
+
@params[:separator] = separator
|
58
|
+
@params[:sort] = sort
|
59
|
+
end
|
60
|
+
|
61
|
+
# Fit the encoder with given training data.
|
62
|
+
#
|
63
|
+
# @overload fit(x) -> HashVectorizer
|
64
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
65
|
+
# @return [HashVectorizer]
|
66
|
+
def fit(x, _y = nil)
|
67
|
+
@feature_names = []
|
68
|
+
@vocabulary = {}
|
69
|
+
|
70
|
+
x.each do |f|
|
71
|
+
f.each do |k, v|
|
72
|
+
k = "#{k}#{separator}#{v}".to_sym if v.is_a?(String)
|
73
|
+
next if @vocabulary.key?(k)
|
74
|
+
@feature_names.push(k)
|
75
|
+
@vocabulary[k] = @vocabulary.size
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
if sort_feature?
|
80
|
+
@feature_names.sort!
|
81
|
+
@feature_names.each_with_index { |k, i| @vocabulary[k] = i }
|
82
|
+
end
|
83
|
+
|
84
|
+
self
|
85
|
+
end
|
86
|
+
|
87
|
+
# Fit the encoder with given training data, then return encoded data.
|
88
|
+
#
|
89
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
90
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
91
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
92
|
+
def fit_transform(x, _y = nil)
|
93
|
+
fit(x).transform(x)
|
94
|
+
end
|
95
|
+
|
96
|
+
# Encode the given array of feature-value hashes.
|
97
|
+
#
|
98
|
+
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
99
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
100
|
+
def transform(x)
|
101
|
+
x = [x] unless x.is_a?(Array)
|
102
|
+
n_samples = x.size
|
103
|
+
n_features = @vocabulary.size
|
104
|
+
z = Numo::DFloat.zeros(n_samples, n_features)
|
105
|
+
|
106
|
+
x.each_with_index do |f, i|
|
107
|
+
f.each do |k, v|
|
108
|
+
if v.is_a?(String)
|
109
|
+
k = "#{k}#{separator}#{v}".to_sym
|
110
|
+
v = 1
|
111
|
+
end
|
112
|
+
z[i, @vocabulary[k]] = v if @vocabulary.key?(k)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
z
|
117
|
+
end
|
118
|
+
|
119
|
+
# Decode sample matrix to the array of feature-value hash.
|
120
|
+
#
|
121
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
122
|
+
# @return [Array<Hash>] The array of hash consisting of feature names and values.
|
123
|
+
def inverse_transform(x)
|
124
|
+
n_samples = x.shape[0]
|
125
|
+
reconst = []
|
126
|
+
|
127
|
+
n_samples.times do |i|
|
128
|
+
f = {}
|
129
|
+
x[i, true].each_with_index do |el, j|
|
130
|
+
feature_key_val(@feature_names[j], el).tap { |k, v| f[k.to_sym] = v } unless el.zero?
|
131
|
+
end
|
132
|
+
reconst.push(f)
|
133
|
+
end
|
134
|
+
|
135
|
+
reconst
|
136
|
+
end
|
137
|
+
|
138
|
+
private
|
139
|
+
|
140
|
+
def feature_key_val(fname, fval)
|
141
|
+
f = fname.to_s.split(separator)
|
142
|
+
f.size == 2 ? f : [fname, fval]
|
143
|
+
end
|
144
|
+
|
145
|
+
def separator
|
146
|
+
@params[:separator]
|
147
|
+
end
|
148
|
+
|
149
|
+
def sort_feature?
|
150
|
+
@params[:sort]
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
data/lib/rumale/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.5
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -207,6 +207,8 @@ files:
|
|
207
207
|
- lib/rumale/evaluation_measure/recall.rb
|
208
208
|
- lib/rumale/evaluation_measure/roc_auc.rb
|
209
209
|
- lib/rumale/evaluation_measure/silhouette_score.rb
|
210
|
+
- lib/rumale/feature_extraction/feature_hasher.rb
|
211
|
+
- lib/rumale/feature_extraction/hash_vectorizer.rb
|
210
212
|
- lib/rumale/kernel_approximation/rbf.rb
|
211
213
|
- lib/rumale/kernel_machine/kernel_pca.rb
|
212
214
|
- lib/rumale/kernel_machine/kernel_ridge.rb
|