eps 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +6 -12
- data/lib/eps/base_estimator.rb +21 -11
- data/lib/eps/evaluators/lightgbm.rb +2 -8
- data/lib/eps/lightgbm.rb +5 -2
- data/lib/eps/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: d56573908e892d8d1959d66c7b6f2940f8930a2d0f2dfd5d4da75e2ff7cfdb63
         | 
| 4 | 
            +
              data.tar.gz: 9eaf1a06c8c51ba15d9b4468796fc869f2933945494d027b54789304080c5d5b
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 971dbd2a95a280ed50925df68a29018ba7b3bccb7094b1374923a8ce7d100720202245843e003b26447832e9c1f8285bafcc7692020f5971a56c0a8e89a12afb
         | 
| 7 | 
            +
              data.tar.gz: de06585dc75608b0f8c62188cce351987a0cd53f3b12889d4d63de28ed81ae1b143e31f47ac8c53083eeb250e18c5f8b721fff94a378e14203fd8fa90ba3e440
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -343,9 +343,7 @@ df = Daru::DataFrame.from_csv("houses.csv") | |
| 343 343 | 
             
            Eps::Model.new(df, target: "price")
         | 
| 344 344 | 
             
            ```
         | 
| 345 345 |  | 
| 346 | 
            -
             | 
| 347 | 
            -
             | 
| 348 | 
            -
            When importing data from CSV files, be sure to convert numeric fields. The `table` method does this automatically.
         | 
| 346 | 
            +
            When reading CSV files directly, be sure to convert numeric fields. The `table` method does this automatically.
         | 
| 349 347 |  | 
| 350 348 | 
             
            ```ruby
         | 
| 351 349 | 
             
            CSV.table("data.csv").map { |row| row.to_h }
         | 
| @@ -375,7 +373,11 @@ Eps::Model.new(data, learning_rate: 0.01) | |
| 375 373 |  | 
| 376 374 | 
             
            ### Linear Regression
         | 
| 377 375 |  | 
| 378 | 
            -
             | 
| 376 | 
            +
            By default, an intercept is included. Disable this with:
         | 
| 377 | 
            +
             | 
| 378 | 
            +
            ```ruby
         | 
| 379 | 
            +
            Eps::Model.new(data, intercept: false)
         | 
| 380 | 
            +
            ```
         | 
| 379 381 |  | 
| 380 382 | 
             
            To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
         | 
| 381 383 |  | 
| @@ -391,14 +393,6 @@ gem 'gslr', group: :development | |
| 391 393 |  | 
| 392 394 | 
             
            It only needs to be available in environments used to build the model.
         | 
| 393 395 |  | 
| 394 | 
            -
            #### Options
         | 
| 395 | 
            -
             | 
| 396 | 
            -
            By default, an intercept is included. Disable this with:
         | 
| 397 | 
            -
             | 
| 398 | 
            -
            ```ruby
         | 
| 399 | 
            -
            Eps::Model.new(data, intercept: false)
         | 
| 400 | 
            -
            ```
         | 
| 401 | 
            -
             | 
| 402 396 | 
             
            ## Probability
         | 
| 403 397 |  | 
| 404 398 | 
             
            To get the probability of each category for predictions with classification, use:
         | 
    
        data/lib/eps/base_estimator.rb
    CHANGED
    
    | @@ -3,6 +3,7 @@ module Eps | |
| 3 3 | 
             
                def initialize(data = nil, y = nil, **options)
         | 
| 4 4 | 
             
                  @options = options.dup
         | 
| 5 5 | 
             
                  @trained = false
         | 
| 6 | 
            +
                  @text_encoders = {}
         | 
| 6 7 | 
             
                  # TODO better pattern - don't pass most options to train
         | 
| 7 8 | 
             
                  train(data, y, **options) if data
         | 
| 8 9 | 
             
                end
         | 
| @@ -209,29 +210,38 @@ module Eps | |
| 209 210 | 
             
                  [data, target]
         | 
| 210 211 | 
             
                end
         | 
| 211 212 |  | 
| 212 | 
            -
                def prep_text_features(train_set)
         | 
| 213 | 
            -
                  @text_encoders = {}
         | 
| 213 | 
            +
                def prep_text_features(train_set, fit: true)
         | 
| 214 214 | 
             
                  @text_features.each do |k, v|
         | 
| 215 | 
            -
                     | 
| 216 | 
            -
             | 
| 215 | 
            +
                    if fit
         | 
| 216 | 
            +
                      # reset vocabulary
         | 
| 217 | 
            +
                      v.delete(:vocabulary)
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                      # TODO determine max features automatically
         | 
| 220 | 
            +
                      # start based on number of rows
         | 
| 221 | 
            +
                      encoder = Eps::TextEncoder.new(**v)
         | 
| 222 | 
            +
                      counts = encoder.fit(train_set.columns.delete(k))
         | 
| 223 | 
            +
                    else
         | 
| 224 | 
            +
                      encoder = @text_encoders[k]
         | 
| 225 | 
            +
                      counts = encoder.transform(train_set.columns.delete(k))
         | 
| 226 | 
            +
                    end
         | 
| 217 227 |  | 
| 218 | 
            -
                    # TODO determine max features automatically
         | 
| 219 | 
            -
                    # start based on number of rows
         | 
| 220 | 
            -
                    encoder = Eps::TextEncoder.new(**v)
         | 
| 221 | 
            -
                    counts = encoder.fit(train_set.columns.delete(k))
         | 
| 222 228 | 
             
                    encoder.vocabulary.each do |word|
         | 
| 223 229 | 
             
                      train_set.columns[[k, word]] = [0] * counts.size
         | 
| 224 230 | 
             
                    end
         | 
| 231 | 
            +
             | 
| 225 232 | 
             
                    counts.each_with_index do |ci, i|
         | 
| 226 233 | 
             
                      ci.each do |word, count|
         | 
| 227 234 | 
             
                        word_key = [k, word]
         | 
| 228 235 | 
             
                        train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
         | 
| 229 236 | 
             
                      end
         | 
| 230 237 | 
             
                    end
         | 
| 231 | 
            -
                    @text_encoders[k] = encoder
         | 
| 232 238 |  | 
| 233 | 
            -
                     | 
| 234 | 
            -
             | 
| 239 | 
            +
                    if fit
         | 
| 240 | 
            +
                      @text_encoders[k] = encoder
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                      # update vocabulary
         | 
| 243 | 
            +
                      v[:vocabulary] = encoder.vocabulary
         | 
| 244 | 
            +
                    end
         | 
| 235 245 | 
             
                  end
         | 
| 236 246 |  | 
| 237 247 | 
             
                  raise "No features left" if train_set.columns.empty?
         | 
    
        data/lib/eps/evaluators/lightgbm.rb
    CHANGED
    
    | @@ -19,13 +19,7 @@ module Eps | |
| 19 19 | 
             
                    # sparse matrix
         | 
| 20 20 | 
             
                    @text_features.each do |k, v|
         | 
| 21 21 | 
             
                      encoder = TextEncoder.new(**v)
         | 
| 22 | 
            -
             | 
| 23 | 
            -
                      values = data.columns.delete(k)
         | 
| 24 | 
            -
                      counts = encoder.transform(values)
         | 
| 25 | 
            -
             | 
| 26 | 
            -
                      encoder.vocabulary.each do |word|
         | 
| 27 | 
            -
                        data.columns[[k, word]] = [0] * values.size
         | 
| 28 | 
            -
                      end
         | 
| 22 | 
            +
                      counts = encoder.transform(data.columns[k])
         | 
| 29 23 |  | 
| 30 24 | 
             
                      counts.each_with_index do |xc, i|
         | 
| 31 25 | 
             
                        row = rows[i]
         | 
| @@ -52,7 +46,7 @@ module Eps | |
| 52 46 | 
             
                      @trees.each_slice(num_trees).each do |trees|
         | 
| 53 47 | 
             
                        tree_scores << sum_trees(rows, trees)
         | 
| 54 48 | 
             
                      end
         | 
| 55 | 
            -
                       | 
| 49 | 
            +
                      rows.size.times.map do |i|
         | 
| 56 50 | 
             
                        v = tree_scores.map { |s| s[i] }
         | 
| 57 51 | 
             
                        if probabilities
         | 
| 58 52 | 
             
                          exp = v.map { |vi| Math.exp(vi) }
         | 
    
        data/lib/eps/lightgbm.rb
    CHANGED
    
    | @@ -57,7 +57,7 @@ module Eps | |
| 57 57 |  | 
| 58 58 | 
             
                  # text feature encoding
         | 
| 59 59 | 
             
                  prep_text_features(train_set)
         | 
| 60 | 
            -
                  prep_text_features(validation_set) if validation_set
         | 
| 60 | 
            +
                  prep_text_features(validation_set, fit: false) if validation_set
         | 
| 61 61 |  | 
| 62 62 | 
             
                  # create params
         | 
| 63 63 | 
             
                  params = {
         | 
| @@ -144,7 +144,10 @@ module Eps | |
| 144 144 | 
             
                  end
         | 
| 145 145 |  | 
| 146 146 | 
             
                  if bad_observations.any?
         | 
| 147 | 
            -
                     | 
| 147 | 
            +
                    bad_observations.each do |obs|
         | 
| 148 | 
            +
                      p obs
         | 
| 149 | 
            +
                    end
         | 
| 150 | 
            +
                    raise "Bug detected in evaluator. Please report an issue."
         | 
| 148 151 | 
             
                  end
         | 
| 149 152 | 
             
                end
         | 
| 150 153 |  | 
    
        data/lib/eps/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: eps
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.3.5 | 
| 4 | 
            +
              version: 0.3.6
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Andrew Kane
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020-06- | 
| 11 | 
            +
            date: 2020-06-19 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: lightgbm
         |