rumale-feature_extraction 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7927d78c3c8294fdaba1f509c5bfa0d3d5960d5813cba42aaa5c2765317064dd
4
+ data.tar.gz: 43422862894245c61da3b8973a3991cccf80d87f901fbab635077a00fe7670d8
5
+ SHA512:
6
+ metadata.gz: 9127e6789c784861dc6302cbd69b6abc6afc841e8ba22ef0e4b1b42cd0a575433fe79e37c3797eee632560cf7d0a7585aee1e2a28ee7d1df8ae770c5be2f587f
7
+ data.tar.gz: a0455a7c16fc510d2428d9476e22d883bb1377779552daba8243ce20bdd332df69be4f3143aa1d4abe2bc4b319210c06872ff6239a25565fa13da82298113b13
data/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2022 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Rumale::FeatureExtraction
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/rumale-feature_extraction.svg)](https://badge.fury.io/rb/rumale-feature_extraction)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-feature_extraction/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction.html)
6
+
7
+ Rumale is a machine learning library in Ruby.
8
+ Rumale::FeatureExtraction provides feature extraction methods,
9
+ such as TF-IDF and feature hashing,
10
+ with the Rumale interface.
11
+
12
+ ## Installation
13
+
14
+ Add this line to your application's Gemfile:
15
+
16
+ ```ruby
17
+ gem 'rumale-feature_extraction'
18
+ ```
19
+
20
+ And then execute:
21
+
22
+ $ bundle install
23
+
24
+ Or install it yourself as:
25
+
26
+ $ gem install rumale-feature_extraction
27
+
28
+ ## Documentation
29
+
30
+ - [Rumale API Documentation - FeatureExtraction](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction.html)
31
+
32
+ ## License
33
+
34
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mmh3'
4
+
5
+ require 'rumale/base/estimator'
6
+ require 'rumale/base/transformer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
# Encode array of feature-value hash to vectors with feature hashing (hashing trick).
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
# This encoder employs signed 32-bit Murmurhash3 as the hash function.
# Values of features that are hashed to the same column are summed.
#
# @example
#   require 'rumale/feature_extraction/feature_hasher'
#
#   encoder = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 10)
#   x = encoder.transform([
#     { dog: 1, cat: 2, elephant: 4 },
#     { dog: 2, run: 5 }
#   ])
#
#   # > pp x
#   # Numo::DFloat#shape=[2,10]
#   # [[0, 0, -4, -1, 0, 0, 0, 0, 0, 2],
#   #  [0, 0, 0, -2, -5, 0, 0, 0, 0, 0]]
class FeatureHasher < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Create a new encoder for converting array of hash consisting of feature names and values to vectors
  # with feature hashing algorithm.
  #
  # @param n_features [Integer] The number of features of encoded samples.
  # @param alternate_sign [Boolean] The flag indicating whether to reflect the sign of the hash value to the feature value.
  def initialize(n_features: 1024, alternate_sign: true)
    super()
    @params = {
      n_features: n_features,
      alternate_sign: alternate_sign
    }
  end

  # This method does not do anything. The encoder does not require training.
  #
  # @overload fit(x) -> FeatureHasher
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [FeatureHasher]
  def fit(_x = nil, _y = nil)
    self
  end

  # Encode given the array of feature-value hash.
  # This method has the same output as the transform method
  # because the encoder does not require training.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Encode given the array of feature-value hash.
  # A String value is treated as a categorical feature: it is hashed as "name=value" with the value 1.
  # Features whose value is zero are skipped.
  #
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  #   A single Hash is accepted and treated as one sample.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
    x = [x] unless x.is_a?(Array)
    n_samples = x.size

    z = Numo::DFloat.zeros(n_samples, n_features)

    x.each_with_index do |f, i|
      f.each do |k, v|
        categorical = v.is_a?(String)
        name = categorical ? "#{k}=#{v}" : k
        val = categorical ? 1 : v
        next if val.zero?

        h = Mmh3.hash32(name)
        fid = h.abs % n_features
        val = -val if alternate_sign? && h < 0
        # Accumulate instead of assigning: distinct features can share a column, and overwriting
        # would silently drop all but the last of them.
        z[i, fid] += val
      end
    end

    z
  end

  private

  # The number of columns of the encoded matrix.
  def n_features
    @params[:n_features]
  end

  # Whether the sign of the hash value is applied to the feature value.
  def alternate_sign?
    @params[:alternate_sign]
  end
end
99
+ end
100
+ end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module FeatureExtraction
8
# Encode array of feature-value hash to vectors.
# This encoder turns array of mappings (Array<Hash>) with pairs of feature names and values into Numo::NArray.
#
# @example
#   require 'rumale/feature_extraction/hash_vectorizer'
#
#   encoder = Rumale::FeatureExtraction::HashVectorizer.new
#   x = encoder.fit_transform([
#     { foo: 1, bar: 2 },
#     { foo: 3, baz: 1 }
#   ])
#
#   # > pp x
#   # Numo::DFloat#shape=[2,3]
#   # [[2, 0, 1],
#   #  [0, 1, 3]]
#
#   x = encoder.fit_transform([
#     { city: 'Dubai', temperature: 33 },
#     { city: 'London', temperature: 12 },
#     { city: 'San Francisco', temperature: 18 }
#   ])
#
#   # > pp x
#   # Numo::DFloat#shape=[3,4]
#   # [[1, 0, 0, 33],
#   #  [0, 1, 0, 12],
#   #  [0, 0, 1, 18]]
#   # > pp encoder.inverse_transform(x)
#   # [{:city=>"Dubai", :temperature=>33.0},
#   #  {:city=>"London", :temperature=>12.0},
#   #  {:city=>"San Francisco", :temperature=>18.0}]
class HashVectorizer < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the list of feature names.
  # @return [Array] (size: [n_features])
  attr_reader :feature_names

  # Return the hash consisting of pairs of feature names and indices.
  # @return [Hash] (size: [n_features])
  attr_reader :vocabulary

  # Create a new encoder for converting array of hash consisting of feature names and values to vectors.
  #
  # @param separator [String] The separator string used for constructing new feature names for categorical feature.
  # @param sort [Boolean] The flag indicating whether to sort feature names.
  def initialize(separator: '=', sort: true)
    super()
    @params = {
      separator: separator,
      sort: sort
    }
  end

  # Fit the encoder with given training data.
  # A String value is treated as a categorical feature and registered under the
  # generated name "<key><separator><value>" (as a Symbol).
  #
  # @overload fit(x) -> HashVectorizer
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  #     A single Hash is accepted and treated as one sample, as in transform.
  # @return [HashVectorizer]
  def fit(x, _y = nil)
    x = [x] unless x.is_a?(Array)
    @feature_names = []
    @vocabulary = {}
    # Generated categorical name => [original key, original value]; lets inverse_transform decode
    # without re-parsing the generated name.
    @categorical_features = {}

    x.each do |f|
      f.each do |k, v|
        categorical = v.is_a?(String)
        name = categorical ? "#{k}#{separator}#{v}".to_sym : k
        next if @vocabulary.key?(name)

        @categorical_features[name] = [k, v.dup.freeze] if categorical
        @feature_names.push(name)
        @vocabulary[name] = @vocabulary.size
      end
    end

    if sort_feature?
      sort_feature_names!
      @feature_names.each_with_index { |k, i| @vocabulary[k] = i }
    end

    self
  end

  # Fit the encoder with given training data, then return encoded data.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #   @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Encode given the array of feature-value hash.
  # Features that are not in the fitted vocabulary are ignored.
  #
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
    x = [x] unless x.is_a?(Array)
    n_samples = x.size
    n_features = @vocabulary.size
    z = Numo::DFloat.zeros(n_samples, n_features)

    x.each_with_index do |f, i|
      f.each do |k, v|
        if v.is_a?(String)
          k = "#{k}#{separator}#{v}".to_sym
          v = 1
        end
        z[i, @vocabulary[k]] = v if @vocabulary.key?(k)
      end
    end

    z
  end

  # Decode sample matrix to the array of feature-value hash.
  # Zero elements are omitted from the decoded hashes, and keys are converted to Symbol.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  # @return [Array<Hash>] The array of hash consisting of feature names and values.
  def inverse_transform(x)
    n_samples = x.shape[0]

    Array.new(n_samples) do |i|
      f = {}
      x[i, true].each_with_index do |el, j|
        next if el.zero?

        k, v = feature_key_val(@feature_names[j], el)
        f[k.to_sym] = v
      end
      f
    end
  end

  private

  # Sort the feature names in place.
  def sort_feature_names!
    @feature_names.sort!
  rescue ArgumentError
    # Keys of different classes (e.g. String keys next to the Symbol names generated for
    # categorical features) are not mutually comparable; order them by their string form instead.
    @feature_names.sort_by!(&:to_s)
  end

  # Return the [key, value] pair to be stored in a decoded hash for the given feature.
  def feature_key_val(fname, fval)
    if @categorical_features
      pair = @categorical_features[fname]
      # dup so that callers mutating a decoded value cannot corrupt the recorded one.
      return pair ? [pair[0], pair[1].dup] : [fname, fval]
    end

    # No recorded pairs (e.g. an instance restored from a dump created before they were kept):
    # fall back to splitting the generated name.
    parts = fname.to_s.split(separator)
    parts.size == 2 ? parts : [fname, fval]
  end

  # The string inserted between key and value of a categorical feature name.
  def separator
    @params[:separator]
  end

  # Whether the feature names are sorted after fitting.
  def sort_feature?
    @params[:sort]
  end
end
156
+ end
157
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/utils'
6
+
7
+ module Rumale
8
+ module FeatureExtraction
9
# Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
#
# @example
#   require 'rumale/feature_extraction/hash_vectorizer'
#   require 'rumale/feature_extraction/tfidf_transformer'
#
#   encoder = Rumale::FeatureExtraction::HashVectorizer.new
#   x = encoder.fit_transform([
#     { foo: 1, bar: 2 },
#     { foo: 3, baz: 1 }
#   ])
#
#   # > pp x
#   # Numo::DFloat#shape=[2,3]
#   # [[2, 0, 1],
#   #  [0, 1, 3]]
#
#   transformer = Rumale::FeatureExtraction::TfidfTransformer.new
#   x_tfidf = transformer.fit_transform(x)
#
#   # > pp x_tfidf
#   # Numo::DFloat#shape=[2,3]
#   # [[0.959056, 0, 0.283217],
#   #  [0, 0.491506, 0.870874]]
#
# *Reference*
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
class TfidfTransformer < ::Rumale::Base::Estimator
  include ::Rumale::Base::Transformer

  # Return the vector consisting of inverse document frequency.
  # It is nil until fit is called with use_idf enabled.
  # @return [Numo::DFloat] (shape: [n_features])
  attr_reader :idf

  # Create a new transformer for converting tf vectors to tf-idf vectors.
  #
  # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
  # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
  # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
  # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
  def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
    super()
    @params = {
      norm: norm,
      use_idf: use_idf,
      smooth_idf: smooth_idf,
      sublinear_tf: sublinear_tf
    }
  end

  # Calculate the inverse document frequency for weighting.
  # Nothing is calculated when use_idf is false.
  #
  # @overload fit(x) -> TfidfTransformer
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
  # @return [TfidfTransformer]
  def fit(x, _y = nil)
    return self unless @params[:use_idf]

    n_samples = x.shape[0]
    # Document frequency: the number of samples in which each feature is positive.
    df = x.class.cast(x.gt(0.0).count(0))

    if @params[:smooth_idf]
      df += 1
      n_samples += 1
    else
      # A feature that never occurs has df == 0, which would give an infinite idf and turn
      # 0 * idf into NaN in transform. Count such a feature as occurring in one sample.
      df += x.class.cast(df.eq(0))
    end

    # to_f keeps the ratio from being truncated when x (and therefore df) is integer-typed.
    @idf = Numo::NMath.log(n_samples.to_f / df) + 1

    self
  end

  # Calculate the idf values, and then transform samples to the tf-idf representation.
  #
  # @overload fit_transform(x) -> Numo::DFloat
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
  # @return [Numo::DFloat] The transformed samples.
  def fit_transform(x, _y = nil)
    fit(x).transform(x)
  end

  # Perform transforming the given samples to the tf-idf representation.
  # The given array is not modified; the calculation is done on a copy.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
  # @return [Numo::DFloat] The transformed samples.
  def transform(x)
    z = x.dup

    # Sublinear scaling is applied to the non-zero elements only, so zeros stay zero.
    z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
    z *= @idf if @params[:use_idf]

    norm = @params[:norm]
    case norm
    when 'l2', 'l1'
      ::Rumale::Utils.normalize(z, norm)
    else
      # Any other value (e.g. 'none') returns the weighted matrix as it is.
      z
    end
  end
end
110
+ end
111
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
# Rumale is a machine learning library in Ruby.
module Rumale
  # This module consists of the classes that extract features from raw data.
  module FeatureExtraction
    # The version string of this gem.
    # @!visibility private
    VERSION = '0.24.0'
  end
end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'numo/narray'
4
+
5
+ require_relative 'feature_extraction/feature_hasher'
6
+ require_relative 'feature_extraction/hash_vectorizer'
7
+ require_relative 'feature_extraction/tfidf_transformer'
8
+ require_relative 'feature_extraction/version'
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rumale-feature_extraction
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.24.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mmh3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: numo-narray
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.9.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: rumale-core
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.24.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.24.0
55
+ description: |
56
+ Rumale::FeatureExtraction provides feature extraction methods,
57
+ such as TF-IDF and feature hashing,
58
+ with Rumale interface.
59
+ email:
60
+ - yoshoku@outlook.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - LICENSE.txt
66
+ - README.md
67
+ - lib/rumale/feature_extraction.rb
68
+ - lib/rumale/feature_extraction/feature_hasher.rb
69
+ - lib/rumale/feature_extraction/hash_vectorizer.rb
70
+ - lib/rumale/feature_extraction/tfidf_transformer.rb
71
+ - lib/rumale/feature_extraction/version.rb
72
+ homepage: https://github.com/yoshoku/rumale
73
+ licenses:
74
+ - BSD-3-Clause
75
+ metadata:
76
+ homepage_uri: https://github.com/yoshoku/rumale
77
+ source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-feature_extraction
78
+ changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
79
+ documentation_uri: https://yoshoku.github.io/rumale/doc/
80
+ rubygems_mfa_required: 'true'
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubygems_version: 3.3.26
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Rumale::FeatureExtraction provides feature extraction methods with Rumale
100
+ interface.
101
+ test_files: []