easy_ml 0.2.0.pre.rc40 → 0.2.0.pre.rc43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/Rakefile +11 -9
  3. data/app/controllers/easy_ml/settings_controller.rb +1 -4
  4. data/app/frontend/pages/SettingsPage.tsx +1 -80
  5. data/app/jobs/easy_ml/batch_job.rb +45 -1
  6. data/app/jobs/easy_ml/compute_feature_job.rb +68 -4
  7. data/app/models/concerns/easy_ml/dataframe_serialization.rb +30 -0
  8. data/app/models/easy_ml/dataset.rb +23 -22
  9. data/app/models/easy_ml/dataset_history.rb +1 -6
  10. data/app/models/easy_ml/datasources/polars_datasource.rb +4 -18
  11. data/app/models/easy_ml/event.rb +2 -1
  12. data/app/models/easy_ml/event_context.rb +58 -0
  13. data/app/models/easy_ml/feature.rb +43 -14
  14. data/app/models/easy_ml/model.rb +4 -7
  15. data/app/models/easy_ml/model_file.rb +17 -48
  16. data/app/models/easy_ml/splitter_history.rb +16 -0
  17. data/app/serializers/easy_ml/prediction_serializer.rb +6 -1
  18. data/config/initializers/zhong.rb +4 -0
  19. data/lib/easy_ml/data/date_converter.rb +1 -0
  20. data/lib/easy_ml/data/polars_reader.rb +17 -4
  21. data/lib/easy_ml/data/statistics_learner.rb +1 -1
  22. data/lib/easy_ml/engine.rb +22 -0
  23. data/lib/easy_ml/pending_migrations.rb +19 -0
  24. data/lib/easy_ml/predict.rb +25 -12
  25. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +39 -157
  26. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt +13 -0
  27. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +4 -2
  28. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +22 -20
  29. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +5 -3
  30. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +26 -24
  31. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +5 -3
  32. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +12 -10
  33. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +21 -19
  34. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_event_contexts.rb.tt +14 -0
  35. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +16 -14
  36. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +10 -8
  37. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +27 -25
  38. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +5 -3
  39. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +13 -11
  40. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +5 -3
  41. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +28 -26
  42. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +13 -11
  43. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +70 -66
  44. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +6 -4
  45. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +6 -4
  46. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +11 -9
  47. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +34 -30
  48. data/lib/easy_ml/railtie/templates/migration/drop_path_from_easy_ml_model_files.rb.tt +11 -0
  49. data/lib/easy_ml/version.rb +1 -1
  50. data/lib/easy_ml.rb +1 -0
  51. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  52. data/public/easy_ml/assets/assets/Application-zpGA_Q9c.css +1 -0
  53. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DF5SSkYi.js → Application.tsx-jPsqOyb0.js} +87 -97
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-jPsqOyb0.js.map +1 -0
  55. metadata +11 -19
  56. data/public/easy_ml/assets/assets/Application-Cu7lNJmG.css +0 -1
  57. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DF5SSkYi.js.map +0 -1
@@ -17,6 +17,7 @@
17
17
  # refresh_every :bigint
18
18
  # created_at :datetime not null
19
19
  # updated_at :datetime not null
20
+ # workflow_status :string
20
21
  #
21
22
  module EasyML
22
23
  class Feature < ActiveRecord::Base
@@ -24,6 +25,11 @@ module EasyML
24
25
  include Historiographer::Silent
25
26
  historiographer_mode :snapshot_only
26
27
 
28
+ enum workflow_status: {
29
+ analyzing: "analyzing",
30
+ ready: "ready",
31
+ failed: "failed",
32
+ }
27
33
  class << self
28
34
  def compute_sha(feature_class)
29
35
  require "digest"
@@ -135,13 +141,22 @@ module EasyML
135
141
  adapter.respond_to?(:batch) || config.dig(:batch_size).present?
136
142
  end
137
143
 
144
+ def primary_key
145
+ pkey = config.dig(:primary_key)
146
+ if pkey.is_a?(Array)
147
+ pkey
148
+ else
149
+ [pkey]
150
+ end
151
+ end
152
+
138
153
  def numeric_primary_key?
139
154
  if primary_key.nil?
140
155
  return false unless should_be_batchable?
141
156
  raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
142
157
  end
143
158
 
144
- dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(&:values).all? do |value|
159
+ dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map { |h| h.respond_to?(:values) ? h.values : h }.all? do |value|
145
160
  case value
146
161
  when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
147
162
  else
@@ -171,22 +186,25 @@ module EasyML
171
186
  unless primary_key.present?
172
187
  raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
173
188
  end
174
- df = reader.query(select: [primary_key.first])
189
+ df = reader.query(select: primary_key)
175
190
  rescue => e
176
191
  raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
177
192
  end
178
193
  return [] if df.nil?
179
194
 
180
195
  min_id = df[primary_key.first].min
181
- max_id = df[primary_key.first].max
196
+ max_id = df[primary_key.last].max
182
197
  end
183
198
 
184
- (min_id..max_id).step(batch_size).map do |batch_start|
199
+ (min_id..max_id).step(batch_size).map.with_index do |batch_start, idx|
185
200
  batch_end = [batch_start + batch_size, max_id + 1].min - 1
186
201
  {
187
202
  feature_id: id,
188
203
  batch_start: batch_start,
189
204
  batch_end: batch_end,
205
+ batch_number: feature_position,
206
+ subbatch_number: idx,
207
+ parent_batch_id: Random.uuid,
190
208
  }
191
209
  end
192
210
  end
@@ -196,13 +214,16 @@ module EasyML
196
214
  end
197
215
 
198
216
  def fit(features: [self], async: false)
199
- jobs = features.flat_map(&:build_batches)
217
+ ordered_features = features.sort_by(&:feature_position)
218
+ jobs = ordered_features.map(&:build_batches)
219
+
200
220
  if async
201
- EasyML::ComputeFeatureJob.enqueue_batch(jobs)
221
+ EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
202
222
  else
203
- jobs.each do |job|
223
+ jobs.flatten.each do |job|
204
224
  EasyML::ComputeFeatureJob.perform(nil, job)
205
225
  end
226
+ features.each(&:after_fit) unless features.any?(&:failed?)
206
227
  end
207
228
  end
208
229
 
@@ -266,13 +287,11 @@ module EasyML
266
287
  batch_df = adapter.fit(df, self, batch_args)
267
288
  end
268
289
  end
269
- raise "Feature #{feature_class}#fit must return a dataframe" unless batch_df.present?
270
- store(batch_df)
271
- updates = {
272
- applied_at: Time.current,
273
- needs_fit: false,
274
- }.compact
275
- update!(updates)
290
+ if batch_df.present?
291
+ store(batch_df)
292
+ else
293
+ "Feature #{feature_class}#fit should return a dataframe, received #{batch_df.class}"
294
+ end
276
295
  batch_df
277
296
  end
278
297
 
@@ -335,6 +354,7 @@ module EasyML
335
354
  def apply_defaults
336
355
  self.name ||= self.feature_class.demodulize.titleize
337
356
  self.version ||= 1
357
+ self.workflow_status ||= :ready
338
358
  end
339
359
 
340
360
  def needs_columns
@@ -371,6 +391,15 @@ module EasyML
371
391
  (should_be_batchable? ? 10_000 : nil)
372
392
  end
373
393
 
394
+ def after_fit
395
+ updates = {
396
+ applied_at: Time.current,
397
+ needs_fit: false,
398
+ workflow_status: :ready,
399
+ }.compact
400
+ update!(updates)
401
+ end
402
+
374
403
  private
375
404
 
376
405
  def bulk_update_positions(features)
@@ -250,6 +250,7 @@ module EasyML
250
250
  bump_version(force: true)
251
251
  path = model_file.full_path(version)
252
252
  full_path = adapter.save_model_file(path)
253
+ puts "saving model to #{full_path}"
253
254
  model_file.upload(full_path)
254
255
 
255
256
  model_file.save
@@ -266,6 +267,7 @@ module EasyML
266
267
  end
267
268
 
268
269
  def cleanup
270
+ puts "keeping files #{files_to_keep}"
269
271
  get_model_file&.cleanup(files_to_keep)
270
272
  end
271
273
 
@@ -488,13 +490,9 @@ module EasyML
488
490
  end
489
491
 
490
492
  def root_dir
491
- persisted = read_attribute(:root_dir)
493
+ relative_dir = read_attribute(:root_dir) || default_root_dir
492
494
 
493
- if persisted.present? && !persisted.blank?
494
- EasyML::Engine.root_dir.join(persisted).to_s
495
- else
496
- default_root_dir
497
- end
495
+ EasyML::Engine.root_dir.join(relative_dir).to_s
498
496
  end
499
497
 
500
498
  def default_root_dir
@@ -544,7 +542,6 @@ module EasyML
544
542
 
545
543
  def new_model_file!
546
544
  build_model_file(
547
- root_dir: root_dir,
548
545
  model: self,
549
546
  s3_bucket: EasyML::Configuration.s3_bucket,
550
547
  s3_region: EasyML::Configuration.s3_region,
@@ -23,7 +23,7 @@ module EasyML
23
23
  belongs_to :model, class_name: "EasyML::Model"
24
24
 
25
25
  include EasyML::Concerns::Configurable
26
- add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key, :root_dir
26
+ add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key
27
27
 
28
28
  def synced_file
29
29
  EasyML::Support::SyncedFile.new(
@@ -37,6 +37,21 @@ module EasyML
37
37
  )
38
38
  end
39
39
 
40
+ def root_dir
41
+ Pathname.new(model.root_dir)
42
+ end
43
+
44
+ def model_root
45
+ File.expand_path("..", root_dir.to_s)
46
+ end
47
+
48
+ def full_path(filename = nil)
49
+ filename = self.filename if filename.nil?
50
+ return nil if filename.nil?
51
+
52
+ root_dir.join(filename).to_s
53
+ end
54
+
40
55
  def exist?
41
56
  fit?
42
57
  end
@@ -54,33 +69,7 @@ module EasyML
54
69
 
55
70
  def upload(path)
56
71
  synced_file.upload(path)
57
- set_path(path)
58
- end
59
-
60
- def set_path(path)
61
- path = get_full_path(path)
62
- basename = Pathname.new(path).basename.to_s
63
- unless path.start_with?(full_dir)
64
- new_path = File.join(full_dir, basename).to_s
65
- FileUtils.mkdir_p(Pathname.new(new_path).dirname.to_s)
66
- FileUtils.cp(path, new_path)
67
- path = new_path
68
- end
69
- self.filename = basename
70
- self.path = get_relative_path(path)
71
- end
72
-
73
- def get_full_path(path)
74
- path = path.to_s
75
-
76
- path = Rails.root.join(path) unless path.match?(Regexp.new(Rails.root.to_s))
77
- path
78
- end
79
-
80
- def get_relative_path(path)
81
- path = path.to_s
82
- path = path.to_s.split(Rails.root.to_s).last
83
- path.to_s.split("/")[0..-2].reject(&:empty?).join("/")
72
+ update(filename: Pathname.new(path).basename.to_s)
84
73
  end
85
74
 
86
75
  def download
@@ -94,26 +83,6 @@ module EasyML
94
83
  Digest::SHA256.file(full_path).hexdigest
95
84
  end
96
85
 
97
- def full_path(filename = nil)
98
- filename = self.filename if filename.nil?
99
- return nil if filename.nil?
100
- return nil if relative_dir.nil?
101
-
102
- Rails.root.join(relative_dir, filename).to_s
103
- end
104
-
105
- def relative_dir
106
- root_dir.to_s.gsub(Regexp.new(Rails.root.to_s), "").gsub!(%r{^/}, "")
107
- end
108
-
109
- def full_dir
110
- Rails.root.join(relative_dir).to_s
111
- end
112
-
113
- def model_root
114
- File.expand_path("..", full_dir)
115
- end
116
-
117
86
  def cleanup!
118
87
  [model_root].each do |dir|
119
88
  EasyML::Support::FileRotate.new(dir, []).cleanup(extension_allowlist)
@@ -1,3 +1,19 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_splitter_histories
4
+ #
5
+ # id :bigint not null, primary key
6
+ # splitter_id :integer not null
7
+ # splitter_type :string not null
8
+ # configuration :json
9
+ # dataset_id :integer not null
10
+ # created_at :datetime not null
11
+ # updated_at :datetime not null
12
+ # history_started_at :datetime not null
13
+ # history_ended_at :datetime
14
+ # history_user_id :integer
15
+ # snapshot_id :string
16
+ #
1
17
  module EasyML
2
18
  class SplitterHistory < ActiveRecord::Base
3
19
  self.table_name = "easy_ml_splitter_histories"
@@ -5,7 +5,12 @@ module EasyML
5
5
  include JSONAPI::Serializer
6
6
 
7
7
  attribute :prediction do |object|
8
- object.prediction_value.symbolize_keys.dig(:value)
8
+ case object.prediction_value
9
+ when Hash
10
+ object.prediction_value.symbolize_keys.dig(:value)
11
+ when Numeric
12
+ object.prediction_value
13
+ end
9
14
  end
10
15
 
11
16
  attributes :id,
@@ -7,5 +7,9 @@ if %w[zhong:start].include?(ARGV.first)
7
7
  every 1.hour, "cleanup" do
8
8
  EasyML::CleanJob.perform_later
9
9
  end
10
+
11
+ every 1.hour, "cleanup" do
12
+ EasyML::ScheduleRetrainingJob.perform_later
13
+ end
10
14
  end
11
15
  end
@@ -3,6 +3,7 @@ module EasyML
3
3
  module DateConverter
4
4
  COMMON_DATE_FORMATS = [
5
5
  "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
6
+ "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
6
7
  "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
7
8
  "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
9
  "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
@@ -196,15 +196,22 @@ module EasyML
196
196
  polars_args[:dtypes].merge!(dtypes)
197
197
  end
198
198
  ext = Pathname.new(file).extname.gsub(/\./, "")
199
+ date_cols = []
199
200
  case ext
200
201
  when "csv"
201
- filtered_args = filter_polars_args(Polars.method(:read_csv))
202
- filtered_args.merge!(infer_schema_length: 1_000_000, null_values: ["\\N", "\\\\N", "NULL"])
202
+ filtered_args, date_cols = filter_polars_args(Polars.method(:read_csv))
203
+ filtered_args.merge!(
204
+ infer_schema_length: 1_000_000,
205
+ null_values: ["\\N", "\\\\N", "NULL"],
206
+ )
203
207
  df = Polars.read_csv(file, **filtered_args)
204
208
  when "parquet"
205
- filtered_args = filter_polars_args(Polars.method(:read_parquet))
209
+ filtered_args, date_cols = filter_polars_args(Polars.method(:read_parquet))
206
210
  df = Polars.read_parquet(file, **filtered_args)
207
211
  end
212
+ date_cols.each do |col|
213
+ df = EasyML::Data::DateConverter.maybe_convert_date(df, col)
214
+ end
208
215
  df
209
216
  end
210
217
 
@@ -214,7 +221,13 @@ module EasyML
214
221
 
215
222
  def filter_polars_args(method)
216
223
  supported_params = method.parameters.map { |_, name| name }
217
- polars_args.select { |k, _| supported_params.include?(k) }
224
+ filtered = polars_args.select { |k, _| supported_params.include?(k) }
225
+
226
+ # Filter out any datetime columns, and use maybe_convert_date to convert later
227
+ date_cols = (filtered[:dtypes] || {}).select { |k, v| v.class == Polars::Datetime }.keys
228
+ filtered[:dtypes] = (filtered[:dtypes] || {}).reject { |k, v| v.class == Polars::Datetime }.compact.to_h
229
+ filtered = filtered.select { |k, _| supported_params.include?(k) }
230
+ return filtered, date_cols
218
231
  end
219
232
 
220
233
  def csv_files
@@ -59,7 +59,7 @@ module EasyML::Data
59
59
  stats[col].merge!(most_frequent_value: series.mode.sort.to_a&.first)
60
60
  if field_type == :categorical
61
61
  stats[col].merge!(
62
- unique_count: series.n_unique,
62
+ unique_count: series.cast(:str).n_unique,
63
63
  counts: Hash[series.value_counts.to_hashes.map(&:values)],
64
64
  )
65
65
  end
@@ -1,5 +1,6 @@
1
1
  require "aws-sdk"
2
2
  require "awesome_print"
3
+ require "rails/all"
3
4
  require "inertia_rails"
4
5
  require "jsonapi/serializer"
5
6
  require "numo/narray"
@@ -68,6 +69,16 @@ module EasyML
68
69
  end
69
70
  end
70
71
 
72
+ initializer "easy_ml.check_pending_migrations" do
73
+ if defined?(Rails::Server)
74
+ config.after_initialize do
75
+ if EasyML.pending_migrations?
76
+ puts "\e[33mWARNING: You have pending EasyML migrations. Run 'rails generate easy_ml:migration' to add them.\e[0m"
77
+ end
78
+ end
79
+ end
80
+ end
81
+
71
82
  initializer "easy_ml.active_job_config" do
72
83
  resque_initializer = File.expand_path("config/initializers/resque.rb", root)
73
84
  require resque_initializer if File.exist?(resque_initializer)
@@ -77,6 +88,17 @@ module EasyML
77
88
  end
78
89
  end
79
90
 
91
+ initializer "easy_ml.configure_secrets" do
92
+ EasyML::Configuration.configure do |config|
93
+ raise "S3_ACCESS_KEY_ID is missing. Set ENV['S3_ACCESS_KEY_ID']" unless ENV["S3_ACCESS_KEY_ID"]
94
+ raise "S3_SECRET_ACCESS_KEY is missing. Set ENV['S3_SECRET_ACCESS_KEY']" unless ENV["S3_SECRET_ACCESS_KEY"]
95
+
96
+ config.s3_access_key_id = ENV["S3_ACCESS_KEY_ID"]
97
+ config.s3_secret_access_key = ENV["S3_SECRET_ACCESS_KEY"]
98
+ config.wandb_api_key = ENV["WANDB_API_KEY"] if ENV["WANDB_API_KEY"]
99
+ end
100
+ end
101
+
80
102
  initializer "easy_ml.setup_generators" do |app|
81
103
  generators_path = EasyML::Engine.root.join("lib/easy_ml/railtie/generators")
82
104
  generators_dirs = Dir[File.join(generators_path, "**", "*.rb")]
@@ -0,0 +1,19 @@
1
+ module EasyML
2
+ def self.pending_migrations?
3
+ return false unless defined?(ActiveRecord)
4
+
5
+ # Get all migration files from our templates
6
+ template_dir = File.expand_path("../railtie/generators/templates/migration", __dir__)
7
+ template_migrations = Dir.glob(File.join(template_dir, "*.tt")).map do |f|
8
+ File.basename(f, ".tt").sub(/^create_/, "")
9
+ end
10
+
11
+ # Get all existing migrations
12
+ existing_migrations = Dir.glob(Rails.root.join("db/migrate/*_*.rb")).map do |f|
13
+ File.basename(f).sub(/^\d+_create_/, "").sub(/\.rb$/, "")
14
+ end
15
+
16
+ # Check if any template migrations are not in existing migrations
17
+ (template_migrations - existing_migrations).any?
18
+ end
19
+ end
@@ -10,25 +10,38 @@ module EasyML
10
10
  @models = {}
11
11
  end
12
12
 
13
- def self.predict(model_name, df)
13
+ def self.predict(model_name, df, serialize: false)
14
14
  if df.is_a?(Hash)
15
15
  df = Polars::DataFrame.new(df)
16
16
  end
17
- raw_input = df.to_hashes&.first
17
+ raw_input = df.to_hashes
18
18
  df = instance.normalize(model_name, df)
19
+ normalized_input = df.to_hashes
19
20
  preds = instance.predict(model_name, df)
20
21
  current_version = instance.get_model(model_name)
21
22
 
22
- EasyML::Prediction.create!(
23
- model: current_version.model,
24
- model_history: current_version,
25
- prediction_type: current_version.model.task,
26
- prediction_value: {
27
- value: preds.first,
28
- }.compact,
29
- raw_input: raw_input,
30
- normalized_input: df.to_hashes&.first,
31
- )
23
+ output = preds.zip(raw_input, normalized_input).map do |pred, raw, norm|
24
+ EasyML::Prediction.create!(
25
+ model: current_version.model,
26
+ model_history: current_version,
27
+ prediction_type: current_version.model.task,
28
+ prediction_value: pred,
29
+ raw_input: raw,
30
+ normalized_input: norm,
31
+ )
32
+ end
33
+
34
+ output = if output.is_a?(Array) && output.count == 1
35
+ output.first
36
+ else
37
+ output
38
+ end
39
+
40
+ if serialize
41
+ EasyML::PredictionSerializer.new(output).serializable_hash
42
+ else
43
+ output
44
+ end
32
45
  end
33
46
 
34
47
  def self.train(model_name, tuner: nil, evaluator: nil)