easy_ml 0.2.0.pre.rc56 → 0.2.0.pre.rc58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/models_controller.rb +3 -0
  4. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  5. data/app/frontend/components/ModelForm.tsx +1 -1
  6. data/app/frontend/components/SearchableSelect.tsx +0 -1
  7. data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
  8. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  9. data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
  10. data/app/models/easy_ml/column.rb +55 -4
  11. data/app/models/easy_ml/column_history.rb +5 -1
  12. data/app/models/easy_ml/column_list.rb +46 -14
  13. data/app/models/easy_ml/dataset.rb +47 -27
  14. data/app/models/easy_ml/datasource.rb +1 -0
  15. data/app/models/easy_ml/feature.rb +10 -3
  16. data/app/models/easy_ml/model.rb +30 -6
  17. data/app/models/easy_ml/model_history.rb +1 -0
  18. data/app/models/easy_ml/models/xgboost/evals_callback.rb +4 -3
  19. data/app/models/easy_ml/retraining_run.rb +1 -0
  20. data/config/initializers/inflections.rb +2 -0
  21. data/config/routes.rb +3 -0
  22. data/lib/easy_ml/core/evaluators/base_evaluator.rb +1 -1
  23. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +9 -9
  24. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +4 -4
  25. data/lib/easy_ml/core/model_evaluator.rb +18 -3
  26. data/lib/easy_ml/core/tuner.rb +23 -17
  27. data/lib/easy_ml/data/preprocessor.rb +10 -53
  28. data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
  29. data/lib/easy_ml/data/statistics_learner.rb +79 -14
  30. data/lib/easy_ml/data/synced_directory.rb +4 -2
  31. data/lib/easy_ml/predict.rb +13 -2
  32. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
  33. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
  34. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  35. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  36. data/lib/easy_ml/version.rb +1 -1
  37. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  38. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
  39. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
  40. metadata +8 -4
data/app/models/easy_ml/model.rb CHANGED
@@ -17,6 +17,7 @@
  # is_training :boolean
  # created_at :datetime not null
  # updated_at :datetime not null
+ # slug :string not null
  #
  require_relative "models/hyperparameters"

@@ -66,6 +67,7 @@ module EasyML
  after_initialize :bump_version, if: -> { new_record? }
  after_initialize :set_defaults, if: -> { new_record? }
  before_save :save_model_file, if: -> { is_fit? && !is_history_class? && model_changed? && !@skip_save_model_file }
+ before_validation :set_slug, if: :name_changed?

  VALID_TASKS = %i[regression classification].freeze

@@ -91,6 +93,7 @@ module EasyML
  }
  validates :model_type, inclusion: { in: MODEL_NAMES }
  validates :dataset_id, presence: true
+ validates :slug, presence: true, uniqueness: true
  validate :validate_metrics_allowed
  before_save :set_root_dir

@@ -189,6 +192,7 @@ module EasyML
  evaluator: evaluator,
  model: self,
  dataset: dataset,
+ metrics: metrics,
  }.compact
  tuner.merge!(extra_params)
  tuner_instance = EasyML::Core::Tuner.new(tuner)
@@ -307,7 +311,6 @@ module EasyML

  dataset.refresh
  adapter.fit(tuning: tuning, x_train: x_train, y_train: y_train, x_valid: x_valid, y_valid: y_valid, &progress_block)
- @is_fit = true
  end

  def batch_args
@@ -334,11 +337,8 @@ module EasyML

  def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
  adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
- @is_fit = true
  end

- attr_accessor :is_fit
-
  def is_fit?
  model_file = get_model_file
  return true if model_file.present? && model_file.fit?
@@ -354,15 +354,16 @@ module EasyML
  dataset.decode_labels(ys, col: col)
  end

- def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil)
+ def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil, dataset: nil)
  evaluator ||= self.evaluator
  if y_pred.nil?
  inputs = default_evaluation_inputs
  y_pred = inputs[:y_pred]
  y_true = inputs[:y_true]
  x_true = inputs[:x_true]
+ dataset = inputs[:dataset]
  end
- EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, evaluator: evaluator)
+ EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset, evaluator: evaluator)
  end

  def evaluator
@@ -446,6 +447,21 @@ module EasyML
  )
  end

+ include Rails.application.routes.mounted_helpers
+
+ def api_fields
+ {
+ url: EasyML::Engine.routes.url_helpers.predictions_path,
+ method: "POST",
+ data: {
+ model: slug,
+ input: dataset.columns.api_inputs.sort_by_required.map(&:to_api).each_with_object({}) do |field, hash|
+ hash[field[:name]] = field.except(:name)
+ end,
+ },
+ }
+ end
+
  class CannotdeployError < StandardError
  end

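The api_fields hash added above documents the prediction endpoint for a model: where to POST and which input fields the dataset expects. A minimal sketch of a client request built from that documentation; the mount path ("/easy_ml"), the slug ("my_model"), and the input column ("amount") are hypothetical, and the exact request contract is defined by predictions_controller.rb, which is not shown in this diff:

    # Hypothetical client for the documented predictions endpoint.
    require "net/http"
    require "json"

    uri = URI("https://example.com/easy_ml/predictions")
    payload = {
      model: "my_model",          # the model's slug, as advertised by api_fields
      input: { amount: 100.0 },   # keys come from dataset.columns.api_inputs
    }
    response = Net::HTTP.post(uri, payload.to_json, "Content-Type" => "application/json")
    puts response.body
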
@@ -524,11 +540,13 @@ module EasyML

  def default_evaluation_inputs
  x_true, y_true = dataset.test(split_ys: true)
+ ds = dataset.test(all_columns: true)
  y_pred = predict(x_true)
  {
  x_true: x_true,
  y_true: y_true,
  y_pred: y_pred,
+ dataset: ds,
  }
  end

@@ -603,6 +621,12 @@ module EasyML
  errors.add(:metrics,
  "don't know how to handle #{"metrics".pluralize(unknown_metrics)} #{unknown_metrics.join(", ")}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
  end
+
+ def set_slug
+ if slug.nil? && name.present?
+ self.slug = name.gsub(/\s/, "_").downcase
+ end
+ end
  end
  end

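The new set_slug callback only fills in a blank slug, lowercasing the name and replacing whitespace with underscores:

    # Sketch of the slug derivation (runs only when slug is nil and name is present):
    "My XGBoost Model".gsub(/\s/, "_").downcase
    # => "my_xgboost_model"

Because slug is also validated for presence and uniqueness, two models whose names differ only by case or spacing would derive the same slug and fail validation.
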
data/app/models/easy_ml/model_history.rb CHANGED
@@ -22,6 +22,7 @@
  # history_ended_at :datetime
  # history_user_id :integer
  # snapshot_id :string
+ # slug :string
  #
  module EasyML
  class ModelHistory < ActiveRecord::Base
data/app/models/easy_ml/models/xgboost/evals_callback.rb CHANGED
@@ -32,7 +32,7 @@ module EasyML
  false
  end

- def validation_dataset
+ def test_dataset
  if tuner.present?
  [tuner.x_true, tuner.y_true]
  else
@@ -46,11 +46,12 @@ module EasyML
  log_frequency = 10
  if epoch % log_frequency == 0
  model.adapter.external_model = booster
- x_true, y_true = validation_dataset
+ x_true, y_true = test_dataset
  @preprocessed ||= model.preprocess(x_true)
  y_pred = model.predict(@preprocessed)
+ dataset = model.dataset.test(all_columns: true)

- metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
+ metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
  Wandb.log(metrics)
  end

data/app/models/easy_ml/retraining_run.rb CHANGED
@@ -158,6 +158,7 @@ module EasyML
  model: training_model,
  y_pred: y_pred,
  y_true: y_true,
+ dataset: training_model.dataset.test(all_columns: true),
  evaluator: evaluator,
  )
  metric_value = metrics[metric]
data/config/initializers/inflections.rb CHANGED
@@ -12,6 +12,8 @@ module EasyML
  inflect.acronym "EST"
  inflect.acronym "UTC"
  inflect.acronym "HTML"
+ inflect.acronym "API"
+ inflect.acronym "APIs"
  end
  end
  end
data/config/routes.rb CHANGED
@@ -11,6 +11,9 @@ EasyML::Engine.routes.draw do
  # Predictions API
  resources :predictions, only: [:create]

+ # API Documentation
+ get "api", to: "apis#show"
+
  resources :models, as: :easy_ml_models do
  member do
  post :train
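These routes are drawn inside EasyML::Engine, so the new documentation endpoint resolves relative to wherever the host application mounts the engine. A sketch assuming a mount point of /easy_ml (the mount path is the host app's choice, not part of this diff):

    # config/routes.rb in the host application
    Rails.application.routes.draw do
      mount EasyML::Engine => "/easy_ml"
    end

    # With that mount point, the engine routes above resolve to:
    #   GET  /easy_ml/api          (API documentation, apis#show)
    #   POST /easy_ml/predictions  (predictions#create)
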
data/lib/easy_ml/core/evaluators/base_evaluator.rb CHANGED
@@ -32,7 +32,7 @@ module EasyML
  end

  # Instance methods that evaluators must implement
- def evaluate(y_pred: nil, y_true: nil, x_true: nil)
+ def evaluate(y_pred: nil, y_true: nil, x_true: nil, dataset: nil)
  raise NotImplementedError, "#{self.class} must implement #evaluate"
  end

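Custom evaluators registered through EasyML::Core::ModelEvaluator.register (the registration API referenced in the model validation message earlier in this diff) should now accept the extra dataset: keyword. A minimal sketch of a custom regression metric under the new contract; the class name, metric key, and the exact BaseEvaluator namespace are assumptions:

    # Hypothetical custom evaluator matching the new keyword signature.
    class MedianAbsoluteError
      include BaseEvaluator

      def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        diffs = (Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)).abs.sort
        mid = diffs.size / 2
        diffs.size.odd? ? diffs[mid] : (diffs[mid - 1] + diffs[mid]) / 2.0
      end
    end

    # Registration, following the hint in the metrics validation message:
    # EasyML::Core::ModelEvaluator.register(:median_absolute_error, MedianAbsoluteError, :regression)
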
data/lib/easy_ml/core/evaluators/classification_evaluators.rb CHANGED
@@ -5,7 +5,7 @@ module EasyML
  class AccuracyScore
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  y_pred = Numo::Int32.cast(y_pred)
  y_true = Numo::Int32.cast(y_true)
  y_pred.eq(y_true).count_true.to_f / y_pred.size
@@ -23,7 +23,7 @@ module EasyML
  class PrecisionScore
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  y_pred = Numo::Int32.cast(y_pred)
  y_true = Numo::Int32.cast(y_true)
  true_positives = (y_pred.eq(1) & y_true.eq(1)).count_true
@@ -45,7 +45,7 @@ module EasyML
  class RecallScore
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  y_pred = Numo::Int32.cast(y_pred)
  y_true = Numo::Int32.cast(y_true)
  true_positives = (y_pred.eq(1) & y_true.eq(1)).count_true
@@ -65,9 +65,9 @@ module EasyML
  class F1Score
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
- precision = PrecisionScore.new.evaluate(y_pred: y_pred, y_true: y_true)
- recall = RecallScore.new.evaluate(y_pred: y_pred, y_true: y_true)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
+ precision = PrecisionScore.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
+ recall = RecallScore.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
  return 0 unless (precision + recall) > 0

  2 * (precision * recall) / (precision + recall)
@@ -85,7 +85,7 @@ module EasyML
  class AUC
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  y_pred = Numo::DFloat.cast(y_pred)
  y_true = Numo::Int32.cast(y_true)

@@ -132,8 +132,8 @@ module EasyML
  class ROC_AUC
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
- AUC.new.evaluate(y_pred: y_pred, y_true: y_true)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
+ AUC.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
  end

  def description
data/lib/easy_ml/core/evaluators/regression_evaluators.rb CHANGED
@@ -5,7 +5,7 @@ module EasyML
  class MeanAbsoluteError
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  (Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)).abs.mean
  end

@@ -21,7 +21,7 @@ module EasyML
  class MeanSquaredError
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  ((Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)) ** 2).mean
  end

@@ -37,7 +37,7 @@ module EasyML
  class RootMeanSquaredError
  include BaseEvaluator

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  Math.sqrt(((Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)) ** 2).mean)
  end

@@ -61,7 +61,7 @@ module EasyML
  "maximize"
  end

- def evaluate(y_pred:, y_true:, x_true: nil)
+ def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
  y_true = Numo::DFloat.cast(y_true)
  y_pred = Numo::DFloat.cast(y_pred)

data/lib/easy_ml/core/model_evaluator.rb CHANGED
@@ -98,13 +98,21 @@ module EasyML
  end
  end

- def evaluate(model:, y_pred:, y_true:, x_true: nil, evaluator: nil)
+ def evaluate(model:, y_pred:, y_true:, x_true: nil, evaluator: nil, dataset: nil)
  y_pred = normalize_input(y_pred)
  y_true = normalize_input(y_true)
  check_size(y_pred, y_true)

  metrics_results = {}

+ if x_true.nil?
+ x_true = model.dataset.test
+ end
+
+ if dataset.nil?
+ dataset = model.dataset.test(all_columns: true)
+ end
+
  model.metrics.each do |metric|
  evaluator_class = get(metric.to_sym)
  next unless evaluator_class
@@ -115,6 +123,7 @@ module EasyML
  y_pred: y_pred,
  y_true: y_true,
  x_true: x_true,
+ dataset: dataset,
  )
  end

@@ -124,7 +133,7 @@ module EasyML
  raise "Unknown evaluator: #{evaluator}" unless evaluator_class

  evaluator_instance = evaluator_class.new
- response = evaluator_instance.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
+ response = evaluator_instance.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)

  if response.is_a?(Hash)
  metrics_results.merge!(response)
@@ -145,6 +154,9 @@ module EasyML
  def normalize_input(input)
  case input
  when Array
+ if input.first.class == TrueClass || input.first.class == FalseClass
+ input = input.map { |value| value ? 1.0 : 0.0 }
+ end
  Numo::DFloat.cast(input)
  when Polars::DataFrame
  if input.columns.count > 1
@@ -152,7 +164,10 @@ module EasyML
  end

  normalize_input(input[input.columns.first])
- when Polars::Series, Array
+ when Polars::Series
+ if input.dtype == Polars::Boolean
+ input = input.cast(Polars::Int64)
+ end
  Numo::DFloat.cast(input)
  else
  raise ArgumentError, "Don't know how to evaluate model with y_pred type #{input.class}"
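The normalize_input changes above let boolean predictions and labels flow into the Numo-based metrics. A short sketch of the two paths, mirroring the logic in the hunk (Polars class names follow the polars-df gem):

    # Plain Ruby arrays of booleans are mapped to floats before the Numo cast:
    [true, false, true].map { |value| value ? 1.0 : 0.0 }
    # => [1.0, 0.0, 1.0]

    # Boolean Polars series are cast to integers first:
    Polars::Series.new([true, false, true]).cast(Polars::Int64).to_a
    # => [1, 0, 1]
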
data/lib/easy_ml/core/tuner.rb CHANGED
@@ -8,7 +8,7 @@ module EasyML
  :metrics, :objective, :n_trials, :direction, :evaluator,
  :study, :results, :adapter, :tune_started_at, :x_true, :y_true,
  :project_name, :job, :current_run, :trial_enumerator, :progress_block,
- :tuner_job
+ :tuner_job, :dataset

  def initialize(options = {})
  @model = options[:model]
@@ -77,6 +77,7 @@ module EasyML
  x_true, y_true = model.dataset.test(split_ys: true)
  self.x_true = x_true
  self.y_true = y_true
+ self.dataset = model.dataset.test(all_columns: true)
  adapter.tune_started_at = tune_started_at
  adapter.y_true = y_true
  adapter.x_true = x_true
@@ -96,14 +97,6 @@ module EasyML
  run_metrics = tune_once
  result = calculate_result(run_metrics)
  @results.push(result)
-
- params = {
- hyperparameters: model.hyperparameters.to_h,
- value: result,
- status: :success,
- }.compact
-
- @tuner_run.update!(params)
  @study.tell(@current_trial, result)
  rescue StandardError => e
  @tuner_run.update!(status: :failed, hyperparameters: {})
@@ -138,14 +131,27 @@ module EasyML
  )
  self.current_run = @tuner_run

- adapter.run_trial(@current_trial) do |model|
- model.fit(tuning: true, &progress_block)
- y_pred = model.predict(x_true)
- model.metrics = metrics
- metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
- puts metrics
- metrics
+ model = adapter.run_trial(@current_trial) do |model|
+ model.tap do
+ model.fit(tuning: true, &progress_block)
+ end
  end
+
+ y_pred = model.predict(x_true)
+ model.metrics = metrics
+ metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
+ metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
+
+ puts metrics
+
+ params = {
+ hyperparameters: model.hyperparameters.to_h,
+ value: metric,
+ status: :success,
+ }.compact
+
+ @tuner_run.update!(params)
+ metrics
  end

  private
@@ -167,7 +173,7 @@ module EasyML
  end
  raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?

- self.metrics = EasyML::Model.new(task: task).allowed_metrics if metrics.nil? || metrics.empty?
+ self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
  end
  end
  end
data/lib/easy_ml/data/preprocessor.rb CHANGED
@@ -90,46 +90,19 @@ module EasyML::Data
  df
  end

- def learn_categorical_min(df, preprocessing_steps)
- preprocessing_steps ||= {}
- preprocessing_steps.deep_symbolize_keys!
-
- allowed_categories = {}
- (preprocessing_steps[:training] || {}).each_key do |col|
- next unless [
- preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
- preprocessing_steps.dig(:training, col, :params, :one_hot),
- preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
- ].any?
-
- cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
- val_counts = df[col].value_counts
- allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
- end
- allowed_categories
- end
-
- def fit(df)
+ def fit(df, precomputed_stats = {})
  return if df.nil?
  return if preprocessing_steps.nil? || preprocessing_steps.keys.none?

  preprocessing_steps.deep_symbolize_keys!
  df = apply_clip(df, preprocessing_steps)
- allowed_categories = learn_categorical_min(df, preprocessing_steps)
-
- self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys

- # Merge allowed categories into statistics
- allowed_categories.each do |col, categories|
- statistics[col] ||= {}
- statistics[col][:allowed_categories] = categories
- statistics[col].merge!(
- fit_categorical(df[col], preprocessing_steps)
- )
- end
+ self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
+ precomputed_stats
+ ).deep_symbolize_keys
  end

- def postprocess(df, inference: false)
+ def postprocess(df, inference: false, computed: false)
  puts "Postprocessing..." if verbose
  return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?

@@ -139,6 +112,11 @@
  preprocessing_steps[:training]
  end

+ if computed
+ computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
+ steps = steps.deep_dup.slice(*computed_cols)
+ end
+
  df = apply_transformations(df, steps)

  puts "Postprocessing complete." if @verbose
@@ -260,27 +238,6 @@ module EasyML::Data
  )
  end

- def fit_categorical(series, _preprocessing_steps)
- value_counts = series.value_counts
- column_names = value_counts.columns
- value_column = column_names[0]
- count_column = column_names[1]
-
- as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
- label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
- h.tap do
- h[k] = i
- end
- end
- label_decoder = label_encoder.invert
-
- {
- value: as_hash,
- label_encoder: label_encoder,
- label_decoder: label_decoder,
- }
- end
-
  def prepare_for_imputation(df, col)
  df = df.with_column(Polars.col(col).cast(Polars::Float64))
  df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
data/lib/easy_ml/data/splits/in_memory_split.rb CHANGED
@@ -41,6 +41,10 @@ module EasyML
  split_features_targets(df, split_ys, target)
  end

+ def query(**kwargs)
+ read("all", **kwargs)
+ end
+
  def cleanup
  @data.clear
  end
data/lib/easy_ml/data/statistics_learner.rb CHANGED
@@ -9,15 +9,16 @@ module EasyML::Data
  @verbose = options[:verbose]
  end

- def self.learn(df, dataset = nil)
- new(df, dataset).learn
+ def self.learn(df, dataset, type)
+ new(df, dataset, type).learn
  end

- attr_reader :df, :dataset
+ attr_reader :df, :dataset, :type

- def initialize(df, dataset)
+ def initialize(df, dataset, type)
  @df = df
  @dataset = dataset
+ @type = type.to_sym
  end

  def learn
@@ -27,18 +28,73 @@ module EasyML::Data
  def learn_split(split)
  df = split.read(:all)
  train_df = split.read(:train)
- all_stats = learn_df(df, dataset: dataset)
- train_stats = learn_df(train_df, dataset: dataset)
+ all_stats = learn_df(df)
+ train_stats = learn_df(train_df)

  all_stats.reduce({}) do |output, (k, _)|
  output.tap do
  output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
- train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value, :last_known_value)
+ train_stats[k].slice(:mean, :median, :min, :max, :std,
+ :last_value, :most_frequent_value, :last_known_value,
+ :allowed_categories, :label_encoder, :label_decoder)
  )
  end
  end
  end

+ def learn_categorical(df)
+ allowed_categories = learn_allowed_categories(df)
+ allowed_categories.reduce({}) do |statistics, (col, categories)|
+ statistics.tap do
+ statistics[col] ||= {}
+ statistics[col][:allowed_categories] = categories
+ statistics[col].merge!(
+ learn_categorical_encoder_decoder(df[col])
+ )
+ end
+ end
+ end
+
+ def learn_categorical_encoder_decoder(series)
+ value_counts = series.value_counts
+ column_names = value_counts.columns
+ value_column = column_names[0]
+ count_column = column_names[1]
+
+ as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
+ label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
+ h.tap do
+ h[k] = i
+ end
+ end
+ label_decoder = label_encoder.invert
+
+ {
+ value: as_hash,
+ label_encoder: label_encoder,
+ label_decoder: label_decoder,
+ }
+ end
+
+ def learn_allowed_categories(df)
+ preprocessing_steps = dataset.preprocessing_steps || {}
+ preprocessing_steps.deep_symbolize_keys!
+
+ allowed_categories = {}
+ (preprocessing_steps[:training] || {}).each_key do |col|
+ next unless [
+ preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
+ preprocessing_steps.dig(:training, col, :params, :one_hot),
+ preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
+ ].any?
+
+ cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
+ val_counts = df[col].value_counts
+ allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
+ end
+ allowed_categories
+ end
+
  def last_known_value(df, col, date_col)
  return nil if df.empty? || !df.columns.include?(date_col)

@@ -53,13 +109,22 @@ module EasyML::Data
  last_value
  end

- def learn_df(df, dataset: nil)
- self.class.learn_df(df, dataset: dataset)
+ def learn_df(df)
+ return if df.nil?
+
+ stats = learn_base_stats(df, dataset: dataset).stringify_keys
+ if type == :raw
+ categorical = learn_categorical(df).stringify_keys
+ categorical.each { |k, v| stats[k].merge!(v) }
+ end
+ stats
  end

- def self.learn_df(df, dataset: nil)
- return if df.nil?
+ def self.learn_df(df, dataset: nil, type: :raw)
+ new(df, dataset, type).learn_df(df)
+ end

+ def learn_base_stats(df, dataset: nil)
  base_stats = describe_to_h(df).deep_symbolize_keys

  # Add basic column statistics first
@@ -103,16 +168,16 @@ module EasyML::Data
  end
  end

- def self.id_column?(column)
+ def id_column?(column)
  col = column.to_s.downcase
  col.match?(/^id$/) || col.match?(/.*_id/)
  end

- def self.last_value(df, col, date_col)
+ def last_value(df, col, date_col)
  df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
  end

- def self.describe_to_h(df)
+ def describe_to_h(df)
  init_h = df.describe.to_h
  rows = init_h.values.map(&:to_a)
  keys = rows.first
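The categorical statistics that moved here from the preprocessor keep the same behavior: a category is kept only when its count meets categorical_min, and a stable encoder/decoder is built from the sorted category names. A worked example using plain Ruby hashes in place of the Polars value_counts frame (the counts are made up):

    value_counts = { "red" => 5, "blue" => 1 }   # stand-in for df[col].value_counts
    cat_min = 2                                  # params[:categorical_min]

    allowed = value_counts.select { |_, count| count >= cat_min }.keys
    # => ["red"]

    label_encoder = value_counts.keys.sort.each_with_index.to_h
    # => { "blue" => 0, "red" => 1 }
    label_decoder = label_encoder.invert
    # => { 0 => "blue", 1 => "red" }
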
data/lib/easy_ml/data/synced_directory.rb CHANGED
@@ -127,8 +127,10 @@ module EasyML
  )

  Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
- ungzipped_file_path = ungzip_file(local_file_path)
- Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
+ if object.key.end_with?(".gz")
+ ungzipped_file_path = ungzip_file(local_file_path)
+ Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
+ end
  rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
  Rails.logger.error("Failed to process #{object.key}: #{e.message}")
  raise e