eps 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +6 -12
- data/lib/eps/base_estimator.rb +21 -11
- data/lib/eps/evaluators/lightgbm.rb +2 -8
- data/lib/eps/lightgbm.rb +5 -2
- data/lib/eps/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d56573908e892d8d1959d66c7b6f2940f8930a2d0f2dfd5d4da75e2ff7cfdb63
|
4
|
+
data.tar.gz: 9eaf1a06c8c51ba15d9b4468796fc869f2933945494d027b54789304080c5d5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 971dbd2a95a280ed50925df68a29018ba7b3bccb7094b1374923a8ce7d100720202245843e003b26447832e9c1f8285bafcc7692020f5971a56c0a8e89a12afb
|
7
|
+
data.tar.gz: de06585dc75608b0f8c62188cce351987a0cd53f3b12889d4d63de28ed81ae1b143e31f47ac8c53083eeb250e18c5f8b721fff94a378e14203fd8fa90ba3e440
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -343,9 +343,7 @@ df = Daru::DataFrame.from_csv("houses.csv")
|
|
343
343
|
Eps::Model.new(df, target: "price")
|
344
344
|
```
|
345
345
|
|
346
|
-
|
347
|
-
|
348
|
-
When importing data from CSV files, be sure to convert numeric fields. The `table` method does this automatically.
|
346
|
+
When reading CSV files directly, be sure to convert numeric fields. The `table` method does this automatically.
|
349
347
|
|
350
348
|
```ruby
|
351
349
|
CSV.table("data.csv").map { |row| row.to_h }
|
@@ -375,7 +373,11 @@ Eps::Model.new(data, learning_rate: 0.01)
|
|
375
373
|
|
376
374
|
### Linear Regression
|
377
375
|
|
378
|
-
|
376
|
+
By default, an intercept is included. Disable this with:
|
377
|
+
|
378
|
+
```ruby
|
379
|
+
Eps::Model.new(data, intercept: false)
|
380
|
+
```
|
379
381
|
|
380
382
|
To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
|
381
383
|
|
@@ -391,14 +393,6 @@ gem 'gslr', group: :development
|
|
391
393
|
|
392
394
|
It only needs to be available in environments used to build the model.
|
393
395
|
|
394
|
-
#### Options
|
395
|
-
|
396
|
-
By default, an intercept is included. Disable this with:
|
397
|
-
|
398
|
-
```ruby
|
399
|
-
Eps::Model.new(data, intercept: false)
|
400
|
-
```
|
401
|
-
|
402
396
|
## Probability
|
403
397
|
|
404
398
|
To get the probability of each category for predictions with classification, use:
|
data/lib/eps/base_estimator.rb
CHANGED
@@ -3,6 +3,7 @@ module Eps
|
|
3
3
|
def initialize(data = nil, y = nil, **options)
|
4
4
|
@options = options.dup
|
5
5
|
@trained = false
|
6
|
+
@text_encoders = {}
|
6
7
|
# TODO better pattern - don't pass most options to train
|
7
8
|
train(data, y, **options) if data
|
8
9
|
end
|
@@ -209,29 +210,38 @@ module Eps
|
|
209
210
|
[data, target]
|
210
211
|
end
|
211
212
|
|
212
|
-
def prep_text_features(train_set)
|
213
|
-
@text_encoders = {}
|
213
|
+
def prep_text_features(train_set, fit: true)
|
214
214
|
@text_features.each do |k, v|
|
215
|
-
|
216
|
-
|
215
|
+
if fit
|
216
|
+
# reset vocabulary
|
217
|
+
v.delete(:vocabulary)
|
218
|
+
|
219
|
+
# TODO determine max features automatically
|
220
|
+
# start based on number of rows
|
221
|
+
encoder = Eps::TextEncoder.new(**v)
|
222
|
+
counts = encoder.fit(train_set.columns.delete(k))
|
223
|
+
else
|
224
|
+
encoder = @text_encoders[k]
|
225
|
+
counts = encoder.transform(train_set.columns.delete(k))
|
226
|
+
end
|
217
227
|
|
218
|
-
# TODO determine max features automatically
|
219
|
-
# start based on number of rows
|
220
|
-
encoder = Eps::TextEncoder.new(**v)
|
221
|
-
counts = encoder.fit(train_set.columns.delete(k))
|
222
228
|
encoder.vocabulary.each do |word|
|
223
229
|
train_set.columns[[k, word]] = [0] * counts.size
|
224
230
|
end
|
231
|
+
|
225
232
|
counts.each_with_index do |ci, i|
|
226
233
|
ci.each do |word, count|
|
227
234
|
word_key = [k, word]
|
228
235
|
train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
|
229
236
|
end
|
230
237
|
end
|
231
|
-
@text_encoders[k] = encoder
|
232
238
|
|
233
|
-
|
234
|
-
|
239
|
+
if fit
|
240
|
+
@text_encoders[k] = encoder
|
241
|
+
|
242
|
+
# update vocabulary
|
243
|
+
v[:vocabulary] = encoder.vocabulary
|
244
|
+
end
|
235
245
|
end
|
236
246
|
|
237
247
|
raise "No features left" if train_set.columns.empty?
|
@@ -19,13 +19,7 @@ module Eps
|
|
19
19
|
# sparse matrix
|
20
20
|
@text_features.each do |k, v|
|
21
21
|
encoder = TextEncoder.new(**v)
|
22
|
-
|
23
|
-
values = data.columns.delete(k)
|
24
|
-
counts = encoder.transform(values)
|
25
|
-
|
26
|
-
encoder.vocabulary.each do |word|
|
27
|
-
data.columns[[k, word]] = [0] * values.size
|
28
|
-
end
|
22
|
+
counts = encoder.transform(data.columns[k])
|
29
23
|
|
30
24
|
counts.each_with_index do |xc, i|
|
31
25
|
row = rows[i]
|
@@ -52,7 +46,7 @@ module Eps
|
|
52
46
|
@trees.each_slice(num_trees).each do |trees|
|
53
47
|
tree_scores << sum_trees(rows, trees)
|
54
48
|
end
|
55
|
-
|
49
|
+
rows.size.times.map do |i|
|
56
50
|
v = tree_scores.map { |s| s[i] }
|
57
51
|
if probabilities
|
58
52
|
exp = v.map { |vi| Math.exp(vi) }
|
data/lib/eps/lightgbm.rb
CHANGED
@@ -57,7 +57,7 @@ module Eps
|
|
57
57
|
|
58
58
|
# text feature encoding
|
59
59
|
prep_text_features(train_set)
|
60
|
-
prep_text_features(validation_set) if validation_set
|
60
|
+
prep_text_features(validation_set, fit: false) if validation_set
|
61
61
|
|
62
62
|
# create params
|
63
63
|
params = {
|
@@ -144,7 +144,10 @@ module Eps
|
|
144
144
|
end
|
145
145
|
|
146
146
|
if bad_observations.any?
|
147
|
-
|
147
|
+
bad_observations.each do |obs|
|
148
|
+
p obs
|
149
|
+
end
|
150
|
+
raise "Bug detected in evaluator. Please report an issue."
|
148
151
|
end
|
149
152
|
end
|
150
153
|
|
data/lib/eps/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.5
|
4
|
+
version: 0.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: lightgbm
|