eps 0.3.5 → 0.3.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 1369016c3cae228f169fe580b54fca3c0d240cda202fa7d03ecc7a4e156ee8c7
- data.tar.gz: bf83ca424c509798d1a1436806b52cba0cfdbefecb8d827d5b17aec7b807b121
+ metadata.gz: d56573908e892d8d1959d66c7b6f2940f8930a2d0f2dfd5d4da75e2ff7cfdb63
+ data.tar.gz: 9eaf1a06c8c51ba15d9b4468796fc869f2933945494d027b54789304080c5d5b
  SHA512:
- metadata.gz: 2bf47d80a301eb546c348aaa71f847fa22ace5bed63d97a1f19eb14bc15388b056cd3f545ccf251b2bbf2afc485ef81e5559849ff7459e9dd9f88a71c7cbf83a
- data.tar.gz: 82d65d84e95a6518cd132c2a42cdec20afd05c0013192941b59ee0edb524874d12b2dd9082dd89be1422872c88e827e031469e43b80336c48c7eab7ff4fe611e
+ metadata.gz: 971dbd2a95a280ed50925df68a29018ba7b3bccb7094b1374923a8ce7d100720202245843e003b26447832e9c1f8285bafcc7692020f5971a56c0a8e89a12afb
+ data.tar.gz: de06585dc75608b0f8c62188cce351987a0cd53f3b12889d4d63de28ed81ae1b143e31f47ac8c53083eeb250e18c5f8b721fff94a378e14203fd8fa90ba3e440
@@ -1,3 +1,7 @@
+ ## 0.3.6 (2020-06-19)
+
+ - Fixed error with text features for LightGBM
+
  ## 0.3.5 (2020-06-10)
 
  - Added `learning_rate` option for LightGBM
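As loose context for the "text features" entry above, here is a hedged sketch of how text features are typically declared on a LightGBM-backed Eps model. The `text_features` option name and all column names and values are assumptions for illustration, not taken from this diff:

```ruby
require "eps"

# Made-up training data; the "description" column is the text feature.
data = [
  {"description" => "cozy cottage near the park",  "bedrooms" => 2, "price" => 150000},
  {"description" => "modern loft downtown",        "bedrooms" => 1, "price" => 180000},
  {"description" => "spacious family home",        "bedrooms" => 4, "price" => 320000},
  {"description" => "small loft near the park",    "bedrooms" => 1, "price" => 160000},
  {"description" => "family home with big garden", "bedrooms" => 5, "price" => 340000},
  {"description" => "downtown cottage, cozy",      "bedrooms" => 2, "price" => 170000}
]

# `text_features` is assumed from the gem's README; LightGBM is the default algorithm.
model = Eps::Model.new(data, target: "price", text_features: ["description"])
puts model.predict("description" => "cozy loft near the park", "bedrooms" => 2)
```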
data/README.md CHANGED
@@ -343,9 +343,7 @@ df = Daru::DataFrame.from_csv("houses.csv")
  Eps::Model.new(df, target: "price")
  ```
 
- ### CSVs
-
- When importing data from CSV files, be sure to convert numeric fields. The `table` method does this automatically.
+ When reading CSV files directly, be sure to convert numeric fields. The `table` method does this automatically.
 
  ```ruby
  CSV.table("data.csv").map { |row| row.to_h }
@@ -375,7 +373,11 @@ Eps::Model.new(data, learning_rate: 0.01)
 
  ### Linear Regression
 
- #### Performance
+ By default, an intercept is included. Disable this with:
+
+ ```ruby
+ Eps::Model.new(data, intercept: false)
+ ```
 
  To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
 
@@ -391,14 +393,6 @@ gem 'gslr', group: :development
 
  It only needs to be available in environments used to build the model.
 
- #### Options
-
- By default, an intercept is included. Disable this with:
-
- ```ruby
- Eps::Model.new(data, intercept: false)
- ```
-
  ## Probability
 
  To get the probability of each category for predictions with classification, use:
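The example that this sentence introduces falls outside the hunk above. A hedged sketch of the call form follows; the `predict_probability` method name is assumed from the gem's README rather than shown in this diff:

```ruby
require "eps"

# Made-up classification data with two categories.
data = [
  {"description" => "car chases and explosions",   "genre" => "action"},
  {"description" => "quiet family conversations",  "genre" => "drama"},
  {"description" => "a heist with fast cars",      "genre" => "action"},
  {"description" => "a slow portrait of a family", "genre" => "drama"}
]

model = Eps::Model.new(data, target: "genre")

# Assumed method name (from the gem's README, not this hunk):
# returns a hash of category => probability instead of a single label.
model.predict_probability("description" => "an explosive car chase")
```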
@@ -3,6 +3,7 @@ module Eps
  def initialize(data = nil, y = nil, **options)
    @options = options.dup
    @trained = false
+   @text_encoders = {}
    # TODO better pattern - don't pass most options to train
    train(data, y, **options) if data
  end
@@ -209,29 +210,38 @@ module Eps
    [data, target]
  end
 
- def prep_text_features(train_set)
-   @text_encoders = {}
+ def prep_text_features(train_set, fit: true)
    @text_features.each do |k, v|
-     # reset vocabulary
-     v.delete(:vocabulary)
+     if fit
+       # reset vocabulary
+       v.delete(:vocabulary)
+
+       # TODO determine max features automatically
+       # start based on number of rows
+       encoder = Eps::TextEncoder.new(**v)
+       counts = encoder.fit(train_set.columns.delete(k))
+     else
+       encoder = @text_encoders[k]
+       counts = encoder.transform(train_set.columns.delete(k))
+     end
 
-     # TODO determine max features automatically
-     # start based on number of rows
-     encoder = Eps::TextEncoder.new(**v)
-     counts = encoder.fit(train_set.columns.delete(k))
      encoder.vocabulary.each do |word|
        train_set.columns[[k, word]] = [0] * counts.size
      end
+
      counts.each_with_index do |ci, i|
        ci.each do |word, count|
          word_key = [k, word]
          train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
        end
      end
-     @text_encoders[k] = encoder
 
-     # update vocabulary
-     v[:vocabulary] = encoder.vocabulary
+     if fit
+       @text_encoders[k] = encoder
+
+       # update vocabulary
+       v[:vocabulary] = encoder.vocabulary
+     end
    end
 
    raise "No features left" if train_set.columns.empty?
@@ -19,13 +19,7 @@ module Eps
  # sparse matrix
  @text_features.each do |k, v|
    encoder = TextEncoder.new(**v)
-
-   values = data.columns.delete(k)
-   counts = encoder.transform(values)
-
-   encoder.vocabulary.each do |word|
-     data.columns[[k, word]] = [0] * values.size
-   end
+   counts = encoder.transform(data.columns[k])
 
    counts.each_with_index do |xc, i|
      row = rows[i]
@@ -52,7 +46,7 @@ module Eps
  @trees.each_slice(num_trees).each do |trees|
    tree_scores << sum_trees(rows, trees)
  end
- data.size.times.map do |i|
+ rows.size.times.map do |i|
    v = tree_scores.map { |s| s[i] }
    if probabilities
      exp = v.map { |vi| Math.exp(vi) }
@@ -57,7 +57,7 @@ module Eps
 
  # text feature encoding
  prep_text_features(train_set)
- prep_text_features(validation_set) if validation_set
+ prep_text_features(validation_set, fit: false) if validation_set
 
  # create params
  params = {
@@ -144,7 +144,10 @@ module Eps
    end
 
    if bad_observations.any?
-     raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
+     bad_observations.each do |obs|
+       p obs
+     end
+     raise "Bug detected in evaluator. Please report an issue."
    end
  end
 
@@ -1,3 +1,3 @@
  module Eps
-   VERSION = "0.3.5"
+   VERSION = "0.3.6"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: eps
  version: !ruby/object:Gem::Version
-   version: 0.3.5
+   version: 0.3.6
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-06-11 00:00:00.000000000 Z
+ date: 2020-06-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: lightgbm