easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc58

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/models_controller.rb +3 -0
  4. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  5. data/app/frontend/components/ModelForm.tsx +1 -1
  6. data/app/frontend/components/SearchableSelect.tsx +0 -1
  7. data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
  8. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  9. data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
  10. data/app/models/easy_ml/column.rb +42 -4
  11. data/app/models/easy_ml/column_history.rb +5 -1
  12. data/app/models/easy_ml/column_list.rb +43 -11
  13. data/app/models/easy_ml/dataset.rb +45 -25
  14. data/app/models/easy_ml/datasource.rb +1 -0
  15. data/app/models/easy_ml/feature.rb +10 -3
  16. data/app/models/easy_ml/model.rb +25 -4
  17. data/app/models/easy_ml/model_history.rb +1 -0
  18. data/app/models/easy_ml/retraining_run.rb +1 -0
  19. data/config/initializers/inflections.rb +2 -0
  20. data/config/routes.rb +3 -0
  21. data/lib/easy_ml/core/tuner.rb +1 -1
  22. data/lib/easy_ml/data/preprocessor.rb +10 -53
  23. data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
  24. data/lib/easy_ml/data/statistics_learner.rb +79 -14
  25. data/lib/easy_ml/data/synced_directory.rb +4 -2
  26. data/lib/easy_ml/predict.rb +13 -2
  27. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
  28. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
  29. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  30. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  31. data/lib/easy_ml/version.rb +1 -1
  32. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  33. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
  34. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
  35. metadata +8 -4
@@ -173,7 +173,7 @@ module EasyML
173
173
  end
174
174
  raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?
175
175
 
176
- self.metrics = EasyML::Model.new(task: task).allowed_metrics if metrics.nil? || metrics.empty?
176
+ self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
177
177
  end
178
178
  end
179
179
  end
@@ -90,46 +90,19 @@ module EasyML::Data
90
90
  df
91
91
  end
92
92
 
93
- def learn_categorical_min(df, preprocessing_steps)
94
- preprocessing_steps ||= {}
95
- preprocessing_steps.deep_symbolize_keys!
96
-
97
- allowed_categories = {}
98
- (preprocessing_steps[:training] || {}).each_key do |col|
99
- next unless [
100
- preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
101
- preprocessing_steps.dig(:training, col, :params, :one_hot),
102
- preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
103
- ].any?
104
-
105
- cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
106
- val_counts = df[col].value_counts
107
- allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
108
- end
109
- allowed_categories
110
- end
111
-
112
- def fit(df)
93
+ def fit(df, precomputed_stats = {})
113
94
  return if df.nil?
114
95
  return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
115
96
 
116
97
  preprocessing_steps.deep_symbolize_keys!
117
98
  df = apply_clip(df, preprocessing_steps)
118
- allowed_categories = learn_categorical_min(df, preprocessing_steps)
119
-
120
- self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys
121
99
 
122
- # Merge allowed categories into statistics
123
- allowed_categories.each do |col, categories|
124
- statistics[col] ||= {}
125
- statistics[col][:allowed_categories] = categories
126
- statistics[col].merge!(
127
- fit_categorical(df[col], preprocessing_steps)
128
- )
129
- end
100
+ self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
101
+ precomputed_stats
102
+ ).deep_symbolize_keys
130
103
  end
131
104
 
132
- def postprocess(df, inference: false)
105
+ def postprocess(df, inference: false, computed: false)
133
106
  puts "Postprocessing..." if verbose
134
107
  return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
135
108
 
@@ -139,6 +112,11 @@ module EasyML::Data
139
112
  preprocessing_steps[:training]
140
113
  end
141
114
 
115
+ if computed
116
+ computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
117
+ steps = steps.deep_dup.slice(*computed_cols)
118
+ end
119
+
142
120
  df = apply_transformations(df, steps)
143
121
 
144
122
  puts "Postprocessing complete." if @verbose
@@ -260,27 +238,6 @@ module EasyML::Data
260
238
  )
261
239
  end
262
240
 
263
- def fit_categorical(series, _preprocessing_steps)
264
- value_counts = series.value_counts
265
- column_names = value_counts.columns
266
- value_column = column_names[0]
267
- count_column = column_names[1]
268
-
269
- as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
270
- label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
271
- h.tap do
272
- h[k] = i
273
- end
274
- end
275
- label_decoder = label_encoder.invert
276
-
277
- {
278
- value: as_hash,
279
- label_encoder: label_encoder,
280
- label_decoder: label_decoder,
281
- }
282
- end
283
-
284
241
  def prepare_for_imputation(df, col)
285
242
  df = df.with_column(Polars.col(col).cast(Polars::Float64))
286
243
  df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
@@ -41,6 +41,10 @@ module EasyML
41
41
  split_features_targets(df, split_ys, target)
42
42
  end
43
43
 
44
+ def query(**kwargs)
45
+ read("all", **kwargs)
46
+ end
47
+
44
48
  def cleanup
45
49
  @data.clear
46
50
  end
@@ -9,15 +9,16 @@ module EasyML::Data
9
9
  @verbose = options[:verbose]
10
10
  end
11
11
 
12
- def self.learn(df, dataset = nil)
13
- new(df, dataset).learn
12
+ def self.learn(df, dataset, type)
13
+ new(df, dataset, type).learn
14
14
  end
15
15
 
16
- attr_reader :df, :dataset
16
+ attr_reader :df, :dataset, :type
17
17
 
18
- def initialize(df, dataset)
18
+ def initialize(df, dataset, type)
19
19
  @df = df
20
20
  @dataset = dataset
21
+ @type = type.to_sym
21
22
  end
22
23
 
23
24
  def learn
@@ -27,18 +28,73 @@ module EasyML::Data
27
28
  def learn_split(split)
28
29
  df = split.read(:all)
29
30
  train_df = split.read(:train)
30
- all_stats = learn_df(df, dataset: dataset)
31
- train_stats = learn_df(train_df, dataset: dataset)
31
+ all_stats = learn_df(df)
32
+ train_stats = learn_df(train_df)
32
33
 
33
34
  all_stats.reduce({}) do |output, (k, _)|
34
35
  output.tap do
35
36
  output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
36
- train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value, :last_known_value)
37
+ train_stats[k].slice(:mean, :median, :min, :max, :std,
38
+ :last_value, :most_frequent_value, :last_known_value,
39
+ :allowed_categories, :label_encoder, :label_decoder)
37
40
  )
38
41
  end
39
42
  end
40
43
  end
41
44
 
45
+ def learn_categorical(df)
46
+ allowed_categories = learn_allowed_categories(df)
47
+ allowed_categories.reduce({}) do |statistics, (col, categories)|
48
+ statistics.tap do
49
+ statistics[col] ||= {}
50
+ statistics[col][:allowed_categories] = categories
51
+ statistics[col].merge!(
52
+ learn_categorical_encoder_decoder(df[col])
53
+ )
54
+ end
55
+ end
56
+ end
57
+
58
+ def learn_categorical_encoder_decoder(series)
59
+ value_counts = series.value_counts
60
+ column_names = value_counts.columns
61
+ value_column = column_names[0]
62
+ count_column = column_names[1]
63
+
64
+ as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
65
+ label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
66
+ h.tap do
67
+ h[k] = i
68
+ end
69
+ end
70
+ label_decoder = label_encoder.invert
71
+
72
+ {
73
+ value: as_hash,
74
+ label_encoder: label_encoder,
75
+ label_decoder: label_decoder,
76
+ }
77
+ end
78
+
79
+ def learn_allowed_categories(df)
80
+ preprocessing_steps = dataset.preprocessing_steps || {}
81
+ preprocessing_steps.deep_symbolize_keys!
82
+
83
+ allowed_categories = {}
84
+ (preprocessing_steps[:training] || {}).each_key do |col|
85
+ next unless [
86
+ preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
87
+ preprocessing_steps.dig(:training, col, :params, :one_hot),
88
+ preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
89
+ ].any?
90
+
91
+ cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
92
+ val_counts = df[col].value_counts
93
+ allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
94
+ end
95
+ allowed_categories
96
+ end
97
+
42
98
  def last_known_value(df, col, date_col)
43
99
  return nil if df.empty? || !df.columns.include?(date_col)
44
100
 
@@ -53,13 +109,22 @@ module EasyML::Data
53
109
  last_value
54
110
  end
55
111
 
56
- def learn_df(df, dataset: nil)
57
- self.class.learn_df(df, dataset: dataset)
112
+ def learn_df(df)
113
+ return if df.nil?
114
+
115
+ stats = learn_base_stats(df, dataset: dataset).stringify_keys
116
+ if type == :raw
117
+ categorical = learn_categorical(df).stringify_keys
118
+ categorical.each { |k, v| stats[k].merge!(v) }
119
+ end
120
+ stats
58
121
  end
59
122
 
60
- def self.learn_df(df, dataset: nil)
61
- return if df.nil?
123
+ def self.learn_df(df, dataset: nil, type: :raw)
124
+ new(df, dataset, type).learn_df(df)
125
+ end
62
126
 
127
+ def learn_base_stats(df, dataset: nil)
63
128
  base_stats = describe_to_h(df).deep_symbolize_keys
64
129
 
65
130
  # Add basic column statistics first
@@ -103,16 +168,16 @@ module EasyML::Data
103
168
  end
104
169
  end
105
170
 
106
- def self.id_column?(column)
171
+ def id_column?(column)
107
172
  col = column.to_s.downcase
108
173
  col.match?(/^id$/) || col.match?(/.*_id/)
109
174
  end
110
175
 
111
- def self.last_value(df, col, date_col)
176
+ def last_value(df, col, date_col)
112
177
  df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
113
178
  end
114
179
 
115
- def self.describe_to_h(df)
180
+ def describe_to_h(df)
116
181
  init_h = df.describe.to_h
117
182
  rows = init_h.values.map(&:to_a)
118
183
  keys = rows.first
@@ -127,8 +127,10 @@ module EasyML
127
127
  )
128
128
 
129
129
  Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
130
- ungzipped_file_path = ungzip_file(local_file_path)
131
- Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
130
+ if object.key.end_with?(".gz")
131
+ ungzipped_file_path = ungzip_file(local_file_path)
132
+ Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
133
+ end
132
134
  rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
133
135
  Rails.logger.error("Failed to process #{object.key}: #{e.message}")
134
136
  raise e
@@ -10,11 +10,17 @@ module EasyML
10
10
  @models = {}
11
11
  end
12
12
 
13
- def self.predict(model_name, df, serialize: false)
13
+ def self.normalize_input(df)
14
14
  if df.is_a?(Hash)
15
15
  df = Polars::DataFrame.new(df)
16
16
  end
17
+ df
18
+ end
19
+
20
+ def self.predict(model_name, df, serialize: false)
21
+ df = normalize_input(df)
17
22
  raw_input = df.to_hashes
23
+
18
24
  df = instance.normalize(model_name, df)
19
25
  normalized_input = df.to_hashes
20
26
  preds = instance.predict(model_name, df)
@@ -52,6 +58,11 @@ module EasyML
52
58
  get_model(model_name).predict(df)
53
59
  end
54
60
 
61
+ def self.validate_input(model_name, df)
62
+ df = normalize_input(df)
63
+ instance.get_model(model_name).dataset.validate_input(df)
64
+ end
65
+
55
66
  def normalize(model_name, df)
56
67
  get_model(model_name).dataset.normalize(df, inference: true)
57
68
  end
@@ -72,7 +83,7 @@ module EasyML
72
83
  private
73
84
 
74
85
  def load_model(model_name)
75
- current_model = EasyML::Model.find_by!(name: model_name).inference_version
86
+ current_model = EasyML::Model.find_by!(slug: model_name).inference_version
76
87
 
77
88
  # Load new model if not loaded or different version
78
89
  model_not_loaded = models[model_name].nil?
@@ -41,6 +41,9 @@ module EasyML
41
41
  add_workflow_status_to_easy_ml_features
42
42
  drop_path_from_easy_ml_model_files
43
43
  add_is_date_column_to_easy_ml_columns
44
+ add_computed_columns_to_easy_ml_columns
45
+ add_slug_to_easy_ml_models
46
+ add_default_to_is_target
44
47
  ].freeze
45
48
 
46
49
  # Specify the next migration number
@@ -0,0 +1,14 @@
1
+ class AddComputedColumnsToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_columns, :computed_by, :string
4
+ add_column :easy_ml_columns, :is_computed, :boolean, default: false
5
+
6
+ add_index :easy_ml_columns, :computed_by
7
+ add_index :easy_ml_columns, :is_computed
8
+
9
+ add_column :easy_ml_column_histories, :computed_by, :string
10
+ add_index :easy_ml_column_histories, :computed_by
11
+ add_column :easy_ml_column_histories, :is_computed, :boolean, default: false
12
+ add_index :easy_ml_column_histories, :is_computed
13
+ end
14
+ end
@@ -0,0 +1,6 @@
1
+ class AddDefaultToIsTarget < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ change_column_default(:easy_ml_columns, :is_target, false)
4
+ change_column_default(:easy_ml_column_histories, :is_target, false)
5
+ end
6
+ end
@@ -0,0 +1,20 @@
1
+ class AddSlugToEasyMLModels < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_models, :slug, :string
4
+ add_index :easy_ml_models, :slug, unique: true
5
+
6
+ reversible do |dir|
7
+ dir.up do
8
+ execute <<-SQL
9
+ UPDATE easy_ml_models
10
+ SET slug = LOWER(REPLACE(name, ' ', '_'))
11
+ SQL
12
+ end
13
+ end
14
+
15
+ change_column_null :easy_ml_models, :slug, false
16
+
17
+ add_column :easy_ml_model_histories, :slug, :string
18
+ add_index :easy_ml_model_histories, :slug
19
+ end
20
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc57"
4
+ VERSION = "0.2.0-rc58"
5
5
 
6
6
  module Version
7
7
  end
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "entrypoints/Application.tsx": {
3
- "file": "assets/entrypoints/Application.tsx-DTZ2348z.js",
3
+ "file": "assets/entrypoints/Application.tsx-DmkdJsDd.js",
4
4
  "name": "entrypoints/Application.tsx",
5
5
  "src": "entrypoints/Application.tsx",
6
6
  "isEntry": true,