rumale-feature_extraction 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7927d78c3c8294fdaba1f509c5bfa0d3d5960d5813cba42aaa5c2765317064dd
4
+ data.tar.gz: 43422862894245c61da3b8973a3991cccf80d87f901fbab635077a00fe7670d8
5
+ SHA512:
6
+ metadata.gz: 9127e6789c784861dc6302cbd69b6abc6afc841e8ba22ef0e4b1b42cd0a575433fe79e37c3797eee632560cf7d0a7585aee1e2a28ee7d1df8ae770c5be2f587f
7
+ data.tar.gz: a0455a7c16fc510d2428d9476e22d883bb1377779552daba8243ce20bdd332df69be4f3143aa1d4abe2bc4b319210c06872ff6239a25565fa13da82298113b13
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2022 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Rumale::FeatureExtraction
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rumale-feature_extraction.svg)](https://badge.fury.io/rb/rumale-feature_extraction)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-feature_extraction/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction.html)
6
+
7
+ Rumale is a machine learning library in Ruby.
8
+ Rumale::FeatureExtraction provides feature extraction methods,
9
+ such as TF-IDF and feature hashing,
10
+ with Rumale interface.
11
+
12
+ ## Installation
13
+
14
+ Add this line to your application's Gemfile:
15
+
16
+ ```ruby
17
+ gem 'rumale-feature_extraction'
18
+ ```
19
+
20
+ And then execute:
21
+
22
+ $ bundle install
23
+
24
+ Or install it yourself as:
25
+
26
+ $ gem install rumale-feature_extraction
27
+
28
+ ## Documentation
29
+
30
+ - [Rumale API Documentation - FeatureExtraction](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction.html)
31
+
32
+ ## License
33
+
34
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mmh3'
4
+
5
+ require 'rumale/base/estimator'
6
+ require 'rumale/base/transformer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
+ # Encode array of feature-value hash to vectors with feature hashing (hashing trick).
11
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
12
+ # This encoder employs signed 32-bit Murmurhash3 as the hash function.
13
+ #
14
+ # @example
15
+ # require 'rumale/feature_extraction/feature_hasher'
16
+ #
17
+ # encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
18
+ # x = encoder.transform([
19
+ # { dog: 1, cat: 2, elephant: 4 },
20
+ # { dog: 2, run: 5 }
21
+ # ])
22
+ #
23
+ # # > pp x
24
+ # # Numo::DFloat#shape=[2,10]
25
+ # # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
26
+ # # [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
27
class FeatureHasher < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Create a new encoder for converting array of hash consisting of feature names and values to vectors
  # with feature hashing algorithm.
  #
  # @param n_features [Integer] The number of features of encoded samples.
  # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
  def initialize(n_features: 1024, alternate_sign: true)
    super()
    @params = {
      n_features: n_features,
      alternate_sign: alternate_sign
    }
  end

  # This method does not do anything. The encoder does not require training.
  #
  # @overload fit(x) -> FeatureHasher
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [FeatureHasher]
  def fit(_x = nil, _y = nil)
    self
  end

  # Encode given the array of feature-value hash.
  # This method has the same output as the transform method
  # because the encoder does not require training.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Encode given the array of feature-value hash.
  #
  # A String value is treated as a categorical feature: the pair is hashed as "name=value" and contributes 1.
  # Entries whose value is zero are skipped. Features that hash to the same column are summed.
  #
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  #   A single Hash is accepted as well and is handled as one sample.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
    x = [x] unless x.is_a?(Array)
    z = Numo::DFloat.zeros(x.size, n_features)

    x.each_with_index do |f, i|
      f.each do |k, v|
        categorical = v.is_a?(String)
        k = "#{k}=#{v}" if categorical
        val = categorical ? 1 : v
        next if val.zero?

        h = Mmh3.hash32(k)
        fid = h.abs % n_features
        val = -val if alternate_sign? && h.negative?
        # Accumulate rather than assign: distinct features may collide on the same column,
        # and overwriting would silently drop all but the last of them.
        z[i, fid] += val
      end
    end

    z
  end

  private

  # The number of columns of the encoded matrix.
  def n_features
    @params[:n_features]
  end

  # Whether the sign of the hash value is applied to the feature value.
  def alternate_sign?
    @params[:alternate_sign]
  end
end
99
+ end
100
+ end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module FeatureExtraction
8
+ # Encode array of feature-value hash to vectors.
9
+ # This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
10
+ #
11
+ # @example
12
+ # require 'rumale/feature_extraction/hash_vectorizer'
13
+ #
14
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
15
+ # x = encoder.fit_transform([
16
+ # { foo: 1, bar: 2 },
17
+ # { foo: 3, baz: 1 }
18
+ # ])
19
+ #
20
+ # # > pp x
21
+ # # Numo::DFloat#shape=[2,3]
22
+ # # [[2, 0, 1],
23
+ # # [0, 1, 3]]
24
+ #
25
+ # x = encoder.fit_transform([
26
+ # { city: 'Dubai', temperature: 33 },
27
+ # { city: 'London', temperature: 12 },
28
+ # { city: 'San Francisco', temperature: 18 }
29
+ # ])
30
+ #
31
+ # # > pp x
32
+ # # Numo::DFloat#shape=[3,4]
33
+ # # [[1, 0, 0, 33],
34
+ # # [0, 1, 0, 12],
35
+ # # [0, 0, 1, 18]]
36
+ # # > pp encoder.inverse_transform(x)
37
+ # # [{:city=>"Dubai", :temperature=>33.0},
38
+ # # {:city=>"London", :temperature=>12.0},
39
+ # # {:city=>"San Francisco", :temperature=>18.0}]
40
class HashVectorizer < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the list of feature names.
  # @return [Array] (size: [n_features])
  attr_reader :feature_names

  # Return the hash consisting of pairs of feature names and indices.
  # @return [Hash] (size: [n_features])
  attr_reader :vocabulary

  # Create a new encoder for converting array of hash consisting of feature names and values to vectors.
  #
  # @param separator [String] The separator string used for constructing new feature names for categorical feature.
  # @param sort [Boolean] The flag indicating whether to sort feature names.
  def initialize(separator: '=', sort: true)
    super()
    @params = {
      separator: separator,
      sort: sort
    }
  end

  # Fit the encoder with given training data.
  #
  # A String value is treated as a categorical feature and is registered under the
  # Symbol name built from "<name><separator><value>".
  #
  # @overload fit(x) -> HashVectorizer
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  #     A single Hash is accepted as well and is handled as one sample, as in the transform method.
  # @return [HashVectorizer]
  def fit(x, _y = nil)
    x = [x] unless x.is_a?(Array)
    @feature_names = []
    @vocabulary = {}
    # Remember which names were generated from categorical pairs so that
    # inverse_transform does not have to guess by splitting on the separator.
    @categorical_pairs = {}

    x.each do |f|
      f.each do |k, v|
        name = encoded_name(k, v)
        next if @vocabulary.key?(name)

        @categorical_pairs[name] = [k.to_s, v] if v.is_a?(String)
        @feature_names.push(name)
        @vocabulary[name] = @vocabulary.size
      end
    end

    if sort_feature?
      sort_feature_names!
      @feature_names.each_with_index { |k, i| @vocabulary[k] = i }
    end

    self
  end

  # Fit the encoder with given training data, then return encoded data.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Encode given the array of feature-value hash.
  # Features that are not in the fitted vocabulary are ignored.
  #
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
    x = [x] unless x.is_a?(Array)
    z = Numo::DFloat.zeros(x.size, @vocabulary.size)

    x.each_with_index do |f, i|
      f.each do |k, v|
        column = @vocabulary[encoded_name(k, v)]
        next if column.nil?

        z[i, column] = v.is_a?(String) ? 1 : v
      end
    end

    z
  end

  # Decode sample matrix to the array of feature-value hash.
  # Elements equal to zero are omitted from the decoded hashes, and all keys are returned as Symbol.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  # @return [Array<Hash>] The array of hash consisting of feature names and values.
  def inverse_transform(x)
    Array.new(x.shape[0]) do |i|
      f = {}
      x[i, true].each_with_index do |el, j|
        next if el.zero?

        k, v = feature_key_val(@feature_names[j], el)
        f[k.to_sym] = v
      end
      f
    end
  end

  private

  # Return the vocabulary name for a feature-value pair: categorical (String valued) pairs are
  # named "<name><separator><value>" as Symbol, other features keep their own key.
  def encoded_name(key, value)
    value.is_a?(String) ? "#{key}#{separator}#{value}".to_sym : key
  end

  # Sort the feature names in place. A mixture of String and Symbol names
  # (e.g. String keys together with generated categorical names) cannot be compared directly,
  # so only that case is ordered by the string representation.
  def sort_feature_names!
    kinds = @feature_names.map(&:class).uniq
    if kinds.size > 1 && (kinds - [String, Symbol]).empty?
      @feature_names.sort_by!(&:to_s)
    else
      @feature_names.sort!
    end
  end

  # Return the original [name, value] pair for an encoded feature.
  def feature_key_val(fname, fval)
    if @categorical_pairs.nil?
      # The encoder state has no record of categorical pairs (e.g. it was restored from
      # an object created before the record existed): fall back to splitting on the separator.
      f = fname.to_s.split(separator)
      return f.size == 2 ? f : [fname, fval]
    end

    key, value = @categorical_pairs[fname]
    key.nil? ? [fname, fval] : [key, value.dup]
  end

  # The string placed between name and value of a categorical feature.
  def separator
    @params[:separator]
  end

  # Whether feature names are sorted after fitting.
  def sort_feature?
    @params[:sort]
  end
end
156
+ end
157
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/utils'
6
+
7
+ module Rumale
8
+ module FeatureExtraction
9
+ # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
10
+ #
11
+ # @example
12
+ # require 'rumale/feature_extraction/hash_vectorizer'
13
+ # require 'rumale/feature_extraction/tfidf_transformer'
14
+ #
15
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
16
+ # x = encoder.fit_transform([
17
+ # { foo: 1, bar: 2 },
18
+ # { foo: 3, baz: 1 }
19
+ # ])
20
+ #
21
+ # # > pp x
22
+ # # Numo::DFloat#shape=[2,3]
23
+ # # [[2, 0, 1],
24
+ # # [0, 1, 3]]
25
+ #
26
+ # transformer = Rumale::FeatureExtraction::TfidfTransformer.new
27
+ # x_tfidf = transformer.fit_transform(x)
28
+ #
29
+ # # > pp x_tfidf
30
+ # # Numo::DFloat#shape=[2,3]
31
+ # # [[0.959056, 0, 0.283217],
32
+ # # [0, 0.491506, 0.870874]]
33
+ #
34
+ # *Reference*
35
+ # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
36
class TfidfTransformer < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consisting of inverse document frequency.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :idf

  # Create a new transformer for converting tf vectors to tf-idf vectors.
  #
  # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
  #   Any value other than 'l1' and 'l2' disables normalization.
  # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
  # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
  # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
  def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
    super()
    @params = {
      norm: norm,
      use_idf: use_idf,
      smooth_idf: smooth_idf,
      sublinear_tf: sublinear_tf
    }
  end

  # Calculate the inverse document frequency for weighting.
  # Nothing is calculated when use_idf is false.
  #
  # @overload fit(x) -> TfidfTransformer
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
  # @return [TfidfTransformer]
  def fit(x, _y = nil)
    return self unless @params[:use_idf]

    n_samples = x.shape[0]
    # Document frequency: the number of samples in which each feature has a positive value.
    df = x.class.cast(x.gt(0.0).count(0))

    if @params[:smooth_idf]
      df += 1
      n_samples += 1
    else
      # A feature that never occurs has df = 0, which would give an infinite idf and turn
      # every zero entry of that column into NaN in transform (0 * Infinity).
      # Such a feature is weighted as if it occurred in a single sample instead.
      unseen = df.eq(0)
      df[unseen] = 1 if unseen.any?
    end

    @idf = Numo::NMath.log(n_samples / df) + 1

    self
  end

  # Calculate the idf values, and then transform samples to the tf-idf representation.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
  # @return [Numo::DFloat] The transformed samples.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Perform transforming the given samples to the tf-idf representation.
  # The given array is not modified; a copy is transformed.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
  # @return [Numo::DFloat] The transformed samples.
  def transform(x)
    z = x.dup

    if @params[:sublinear_tf]
      # Only non-zero entries are scaled so that absent terms stay at zero.
      nonzero = z.ne(0)
      z[nonzero] = Numo::NMath.log(z[nonzero]) + 1
    end
    z *= @idf if @params[:use_idf]

    norm = @params[:norm]
    %w[l1 l2].include?(norm) ? ::Rumale::Utils.normalize(z, norm) : z
  end
end
110
+ end
111
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Rumale is a machine learning library in Ruby.
4
+ module Rumale
5
+ # This module consists of the classes that extract features from raw data (feature hashing, hash vectorization, and TF-IDF weighting).
6
+ module FeatureExtraction
7
+ # @!visibility private
8
+ VERSION = '0.24.0'
9
+ end
10
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require_relative 'feature_extraction/feature_hasher'
6
+ require_relative 'feature_extraction/hash_vectorizer'
7
+ require_relative 'feature_extraction/tfidf_transformer'
8
+ require_relative 'feature_extraction/version'
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rumale-feature_extraction
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.24.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mmh3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: numo-narray
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.9.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: rumale-core
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.24.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.24.0
55
+ description: |
56
+ Rumale::FeatureExtraction provides feature extraction methods,
57
+ such as TF-IDF and feature hashing,
58
+ with Rumale interface.
59
+ email:
60
+ - yoshoku@outlook.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - LICENSE.txt
66
+ - README.md
67
+ - lib/rumale/feature_extraction.rb
68
+ - lib/rumale/feature_extraction/feature_hasher.rb
69
+ - lib/rumale/feature_extraction/hash_vectorizer.rb
70
+ - lib/rumale/feature_extraction/tfidf_transformer.rb
71
+ - lib/rumale/feature_extraction/version.rb
72
+ homepage: https://github.com/yoshoku/rumale
73
+ licenses:
74
+ - BSD-3-Clause
75
+ metadata:
76
+ homepage_uri: https://github.com/yoshoku/rumale
77
+ source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-feature_extraction
78
+ changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
79
+ documentation_uri: https://yoshoku.github.io/rumale/doc/
80
+ rubygems_mfa_required: 'true'
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubygems_version: 3.3.26
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Rumale::FeatureExtraction provides feature extraction methods with Rumale
100
+ interface.
101
+ test_files: []