eps 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1369016c3cae228f169fe580b54fca3c0d240cda202fa7d03ecc7a4e156ee8c7
4
- data.tar.gz: bf83ca424c509798d1a1436806b52cba0cfdbefecb8d827d5b17aec7b807b121
3
+ metadata.gz: d56573908e892d8d1959d66c7b6f2940f8930a2d0f2dfd5d4da75e2ff7cfdb63
4
+ data.tar.gz: 9eaf1a06c8c51ba15d9b4468796fc869f2933945494d027b54789304080c5d5b
5
5
  SHA512:
6
- metadata.gz: 2bf47d80a301eb546c348aaa71f847fa22ace5bed63d97a1f19eb14bc15388b056cd3f545ccf251b2bbf2afc485ef81e5559849ff7459e9dd9f88a71c7cbf83a
7
- data.tar.gz: 82d65d84e95a6518cd132c2a42cdec20afd05c0013192941b59ee0edb524874d12b2dd9082dd89be1422872c88e827e031469e43b80336c48c7eab7ff4fe611e
6
+ metadata.gz: 971dbd2a95a280ed50925df68a29018ba7b3bccb7094b1374923a8ce7d100720202245843e003b26447832e9c1f8285bafcc7692020f5971a56c0a8e89a12afb
7
+ data.tar.gz: de06585dc75608b0f8c62188cce351987a0cd53f3b12889d4d63de28ed81ae1b143e31f47ac8c53083eeb250e18c5f8b721fff94a378e14203fd8fa90ba3e440
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.3.6 (2020-06-19)
2
+
3
+ - Fixed error with text features for LightGBM
4
+
1
5
  ## 0.3.5 (2020-06-10)
2
6
 
3
7
  - Added `learning_rate` option for LightGBM
data/README.md CHANGED
@@ -343,9 +343,7 @@ df = Daru::DataFrame.from_csv("houses.csv")
343
343
  Eps::Model.new(df, target: "price")
344
344
  ```
345
345
 
346
- ### CSVs
347
-
348
- When importing data from CSV files, be sure to convert numeric fields. The `table` method does this automatically.
346
+ When reading CSV files directly, be sure to convert numeric fields. The `table` method does this automatically.
349
347
 
350
348
  ```ruby
351
349
  CSV.table("data.csv").map { |row| row.to_h }
@@ -375,7 +373,11 @@ Eps::Model.new(data, learning_rate: 0.01)
375
373
 
376
374
  ### Linear Regression
377
375
 
378
- #### Performance
376
+ By default, an intercept is included. Disable this with:
377
+
378
+ ```ruby
379
+ Eps::Model.new(data, intercept: false)
380
+ ```
379
381
 
380
382
  To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
381
383
 
@@ -391,14 +393,6 @@ gem 'gslr', group: :development
391
393
 
392
394
  It only needs to be available in environments used to build the model.
393
395
 
394
- #### Options
395
-
396
- By default, an intercept is included. Disable this with:
397
-
398
- ```ruby
399
- Eps::Model.new(data, intercept: false)
400
- ```
401
-
402
396
  ## Probability
403
397
 
404
398
  To get the probability of each category for predictions with classification, use:
data/lib/eps/base_estimator.rb CHANGED
@@ -3,6 +3,7 @@ module Eps
3
3
  def initialize(data = nil, y = nil, **options)
4
4
  @options = options.dup
5
5
  @trained = false
6
+ @text_encoders = {}
6
7
  # TODO better pattern - don't pass most options to train
7
8
  train(data, y, **options) if data
8
9
  end
@@ -209,29 +210,38 @@ module Eps
209
210
  [data, target]
210
211
  end
211
212
 
212
- def prep_text_features(train_set)
213
- @text_encoders = {}
213
+ def prep_text_features(train_set, fit: true)
214
214
  @text_features.each do |k, v|
215
- # reset vocabulary
216
- v.delete(:vocabulary)
215
+ if fit
216
+ # reset vocabulary
217
+ v.delete(:vocabulary)
218
+
219
+ # TODO determine max features automatically
220
+ # start based on number of rows
221
+ encoder = Eps::TextEncoder.new(**v)
222
+ counts = encoder.fit(train_set.columns.delete(k))
223
+ else
224
+ encoder = @text_encoders[k]
225
+ counts = encoder.transform(train_set.columns.delete(k))
226
+ end
217
227
 
218
- # TODO determine max features automatically
219
- # start based on number of rows
220
- encoder = Eps::TextEncoder.new(**v)
221
- counts = encoder.fit(train_set.columns.delete(k))
222
228
  encoder.vocabulary.each do |word|
223
229
  train_set.columns[[k, word]] = [0] * counts.size
224
230
  end
231
+
225
232
  counts.each_with_index do |ci, i|
226
233
  ci.each do |word, count|
227
234
  word_key = [k, word]
228
235
  train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
229
236
  end
230
237
  end
231
- @text_encoders[k] = encoder
232
238
 
233
- # update vocabulary
234
- v[:vocabulary] = encoder.vocabulary
239
+ if fit
240
+ @text_encoders[k] = encoder
241
+
242
+ # update vocabulary
243
+ v[:vocabulary] = encoder.vocabulary
244
+ end
235
245
  end
236
246
 
237
247
  raise "No features left" if train_set.columns.empty?
data/lib/eps/evaluators/lightgbm.rb CHANGED
@@ -19,13 +19,7 @@ module Eps
19
19
  # sparse matrix
20
20
  @text_features.each do |k, v|
21
21
  encoder = TextEncoder.new(**v)
22
-
23
- values = data.columns.delete(k)
24
- counts = encoder.transform(values)
25
-
26
- encoder.vocabulary.each do |word|
27
- data.columns[[k, word]] = [0] * values.size
28
- end
22
+ counts = encoder.transform(data.columns[k])
29
23
 
30
24
  counts.each_with_index do |xc, i|
31
25
  row = rows[i]
@@ -52,7 +46,7 @@ module Eps
52
46
  @trees.each_slice(num_trees).each do |trees|
53
47
  tree_scores << sum_trees(rows, trees)
54
48
  end
55
- data.size.times.map do |i|
49
+ rows.size.times.map do |i|
56
50
  v = tree_scores.map { |s| s[i] }
57
51
  if probabilities
58
52
  exp = v.map { |vi| Math.exp(vi) }
data/lib/eps/lightgbm.rb CHANGED
@@ -57,7 +57,7 @@ module Eps
57
57
 
58
58
  # text feature encoding
59
59
  prep_text_features(train_set)
60
- prep_text_features(validation_set) if validation_set
60
+ prep_text_features(validation_set, fit: false) if validation_set
61
61
 
62
62
  # create params
63
63
  params = {
@@ -144,7 +144,10 @@ module Eps
144
144
  end
145
145
 
146
146
  if bad_observations.any?
147
- raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
147
+ bad_observations.each do |obs|
148
+ p obs
149
+ end
150
+ raise "Bug detected in evaluator. Please report an issue."
148
151
  end
149
152
  end
150
153
 
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.3.5"
2
+ VERSION = "0.3.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-11 00:00:00.000000000 Z
11
+ date: 2020-06-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lightgbm