easy_ml 0.2.0.pre.rc77 → 0.2.0.pre.rc78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/models_controller.rb +3 -2
  3. data/app/frontend/components/ModelForm.tsx +16 -0
  4. data/app/frontend/components/ScheduleModal.tsx +0 -2
  5. data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
  6. data/app/jobs/easy_ml/application_job.rb +1 -0
  7. data/app/jobs/easy_ml/batch_job.rb +47 -6
  8. data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
  9. data/app/jobs/easy_ml/reaper.rb +14 -10
  10. data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
  11. data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
  12. data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
  13. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  14. data/app/models/easy_ml/column/imputers/today.rb +1 -1
  15. data/app/models/easy_ml/column/selector.rb +0 -8
  16. data/app/models/easy_ml/column.rb +1 -1
  17. data/app/models/easy_ml/dataset/learner/base.rb +2 -2
  18. data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
  19. data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
  20. data/app/models/easy_ml/dataset.rb +25 -27
  21. data/app/models/easy_ml/datasource.rb +0 -6
  22. data/app/models/easy_ml/feature.rb +12 -3
  23. data/app/models/easy_ml/model.rb +20 -2
  24. data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
  25. data/app/models/easy_ml/models/xgboost.rb +52 -36
  26. data/app/models/easy_ml/retraining_run.rb +1 -1
  27. data/app/serializers/easy_ml/model_serializer.rb +1 -0
  28. data/lib/easy_ml/core/tuner.rb +7 -4
  29. data/lib/easy_ml/data/dataset_manager/writer/base.rb +26 -9
  30. data/lib/easy_ml/data/dataset_manager/writer.rb +5 -1
  31. data/lib/easy_ml/data/dataset_manager.rb +8 -2
  32. data/lib/easy_ml/data/polars_column.rb +19 -5
  33. data/lib/easy_ml/engine.rb +16 -14
  34. data/lib/easy_ml/feature_store.rb +19 -16
  35. data/lib/easy_ml/support/lockable.rb +1 -5
  36. data/lib/easy_ml/version.rb +1 -1
  37. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  38. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
  39. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
  40. metadata +6 -7
  41. data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
  42. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee980703e3a768458e43d54a878bfa712d4f026967f3ccd8fa5bb2d1df50304c
4
- data.tar.gz: eb5eb31b580e9112886527f416d4f360ffe9b0ee73d9e2e7dd70d9a48528ea09
3
+ metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
4
+ data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
5
5
  SHA512:
6
- metadata.gz: ddc8a0005b22caf186c13790e9209d7b843181d62be8e70710e33bd6e244e3f31e6cc02efc4effa82d9f38674ff8ba2d8abdc1e43db8ba339d712f0e12d10ec4
7
- data.tar.gz: 52193a2c0da5c0aca86bb627afff7efd9cbdfcf80ba2db8ae4e9baec061b716244e14c49f30451f52a221700b2b1a160a71935cb76cd77732a4e8fdcf25bd3a1
6
+ metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
7
+ data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
@@ -30,7 +30,7 @@ module EasyML
30
30
  def new
31
31
  render inertia: "pages/NewModelPage", props: {
32
32
  datasets: EasyML::Dataset.all.map do |dataset|
33
- dataset.slice(:id, :name, :num_rows)
33
+ dataset_to_json(dataset)
34
34
  end,
35
35
  constants: EasyML::Model.constants,
36
36
  }
@@ -41,7 +41,7 @@ module EasyML
41
41
  render inertia: "pages/EditModelPage", props: {
42
42
  model: model_to_json(model),
43
43
  datasets: EasyML::Dataset.all.map do |dataset|
44
- dataset.slice(:id, :name, :num_rows)
44
+ dataset_to_json(dataset)
45
45
  end,
46
46
  constants: EasyML::Model.constants,
47
47
  }
@@ -177,6 +177,7 @@ module EasyML
177
177
  :dataset_id,
178
178
  :task,
179
179
  :objective,
180
+ :weights_column,
180
181
  metrics: [],
181
182
  retraining_job_attributes: [
182
183
  :id,
@@ -16,6 +16,7 @@ interface ModelFormProps {
16
16
  task: string;
17
17
  objective?: string;
18
18
  metrics?: string[];
19
+ weights_column?: string;
19
20
  retraining_job?: {
20
21
  frequency: string;
21
22
  at: {
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
75
76
  task: initialData?.task || 'classification',
76
77
  objective: initialData?.objective || 'binary:logistic',
77
78
  metrics: initialData?.metrics || ['accuracy_score'],
79
+ weights_column: initialData?.weights_column || '',
78
80
  retraining_job_attributes: initialData?.retraining_job ? {
79
81
  id: initialData.retraining_job.id,
80
82
  frequency: initialData.retraining_job.frequency,
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
165
167
  };
166
168
 
167
169
  const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
170
+ const columns = selectedDataset?.columns || [];
168
171
 
169
172
  const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
170
173
 
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
246
249
  <ErrorDisplay error={errors.dataset_id} />
247
250
  </div>
248
251
 
252
+ <div>
253
+ <label className="block text-sm font-medium text-gray-700 mb-1">
254
+ Weights Column (Optional)
255
+ </label>
256
+ <SearchableSelect
257
+ value={data.model.weights_column}
258
+ options={columns.map(col => ({ value: col.name, label: col.name }))}
259
+ onChange={(value) => setData('model.weights_column', value)}
260
+ isClearable={true}
261
+ />
262
+ <ErrorDisplay error={errors.weights_column} />
263
+ </div>
264
+
249
265
  <div>
250
266
  <label className="block text-sm font-medium text-gray-700 mb-1">
251
267
  Task
@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
587
587
  value={formData.retraining_job_attributes.threshold}
588
588
  onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
589
589
  step={0.01}
590
- min={0}
591
- max={1}
592
590
  className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
593
591
  />
594
592
  </div>
@@ -250,16 +250,17 @@ export function PreprocessingConfig({
250
250
  setIsEditingDescription(true);
251
251
  };
252
252
 
253
- let nullCount = (column.statistics?.processed.null_count || column.statistics?.raw?.null_count) || 0;
254
- const nullPercentage = nullCount && column.statistics?.raw.num_rows
255
- ? ((nullCount / column.statistics.raw.num_rows) * 100)
253
+ let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
254
+ let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
255
+ const nullPercentage = nullCount && numRows
256
+ ? ((nullCount / numRows) * 100)
256
257
  : 0;
257
258
 
258
- const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.raw.num_rows
259
- ? ((column.statistics.processed.null_count / column.statistics.raw.num_rows) * 100)
259
+ const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
260
+ ? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
260
261
  : 0;
261
262
 
262
- const totalRows = column.statistics?.raw.num_rows ?? 0;
263
+ const totalRows = numRows;
263
264
 
264
265
  const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
265
266
  const strategy = type === 'training' ? training : inference;
@@ -1,5 +1,6 @@
1
1
  module EasyML
2
2
  class ApplicationJob < ActiveJob::Base
3
+ @queue = :easy_ml
3
4
  queue_as :easy_ml
4
5
 
5
6
  def create_event(model, status, error = nil)
@@ -39,15 +39,15 @@ module EasyML
39
39
  rest.map do |batch|
40
40
  Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
41
41
  end
42
-
42
+ track_batch(parent_id)
43
43
  handle_batch(parent_id, batch)
44
44
  end
45
45
 
46
46
  def handle_batch(parent_id, batch)
47
47
  if batch.size > 1
48
- enqueue_batch(batch)
48
+ enqueue_batch(batch, parent_id)
49
49
  else
50
- run_one_batch(parent_id, batch.first)
50
+ new.perform(parent_id, batch.first)
51
51
  after_batch_hook(parent_id, batch)
52
52
  end
53
53
  end
@@ -60,7 +60,21 @@ module EasyML
60
60
  end
61
61
 
62
62
  def next_batch?(parent_id)
63
- batches_remaining(parent_id) > 0
63
+ (batches_remaining(parent_id) > 0)
64
+ end
65
+
66
+ def list_batches
67
+ Resque.redis.hkeys("batches:tracking")
68
+ end
69
+
70
+ def track_batch(parent_id)
71
+ Resque.redis.hset("batches:tracking", parent_id, 1)
72
+ end
73
+
74
+ def cleanup_all
75
+ list_batches.each do |batch_id|
76
+ cleanup_batch(batch_id)
77
+ end
64
78
  end
65
79
 
66
80
  def batches_remaining(parent_id)
@@ -69,12 +83,39 @@ module EasyML
69
83
 
70
84
  def cleanup_batch(parent_id)
71
85
  Resque.redis.del("batch:#{parent_id}:remaining")
86
+ Resque.redis.hdel("batches:tracking", parent_id)
72
87
  end
73
88
 
74
- private
89
+ def batch_args
90
+ list_batches.map do |batch_id|
91
+ fetch_batch_arguments(batch_id)
92
+ end
93
+ end
94
+
95
+ def select_batches(&block)
96
+ list_batches.select do |batch_id|
97
+ yield fetch_batch_arguments(batch_id)
98
+ end
99
+ end
100
+
101
+ def poll
102
+ while true
103
+ sleep 2
104
+ EasyML::BatchJob.list_batches.map do |batch|
105
+ puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
106
+ end
107
+ end
108
+ end
75
109
 
76
110
  def get_parent_batch_id(args_list)
77
- args_list.dup.flatten.first.dig(:parent_batch_id)
111
+ args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
112
+ end
113
+
114
+ private
115
+
116
+ def get_args_list(batch_id)
117
+ redis_key = "#{batch(batch_id)}:original_args"
118
+ redis.get(redis_key)
78
119
  end
79
120
 
80
121
  # Store batch arguments in Redis
@@ -14,31 +14,31 @@ module EasyML
14
14
  #
15
15
  # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
16
16
  batch_args = batch_args.dup
17
- run_one_batch(batch_id, batch_args)
17
+ EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
18
18
  end
19
19
 
20
- def self.run_one_batch(batch_id, batch_args)
20
+ def perform(batch_id, batch_args = {})
21
21
  EasyML::Feature.fit_one_batch(batch_id, batch_args)
22
22
  end
23
23
 
24
24
  def self.after_batch_hook(batch_id, *args)
25
- batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
26
- feature_ids = batch_args.pluck(:feature_id).uniq
27
- parent_id = batch_args.pluck(:parent_batch_id).first
25
+ args = args.flatten.first.with_indifferent_access
26
+ feature_id = args.dig(:feature_id)
28
27
 
29
- feature = EasyML::Feature.find_by(id: feature_ids.first)
28
+ feature = EasyML::Feature.find_by(id: feature_id)
30
29
 
31
30
  if feature.failed?
32
31
  dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
33
- return BatchJob.cleanup_batch(parent_id)
32
+ return BatchJob.cleanup_batch(batch_id)
34
33
  end
35
34
 
36
35
  feature.after_fit
37
36
 
38
- if BatchJob.next_batch?(parent_id)
39
- BatchJob.enqueue_next_batch(self, parent_id)
37
+ if BatchJob.next_batch?(batch_id)
38
+ BatchJob.enqueue_next_batch(self, batch_id)
40
39
  else
41
- dataset = EasyML::Feature.find_by(id: feature_ids.first).dataset
40
+ cleanup_batch(batch_id)
41
+ dataset = feature.dataset
42
42
  dataset.after_fit_features
43
43
  end
44
44
  end
@@ -9,8 +9,8 @@ module EasyML
9
9
  {
10
10
  worker: worker,
11
11
  working: true,
12
- class: args.dig("job_class"),
13
- args: args.dig("arguments"),
12
+ class: args.is_a?(Hash) ? args.dig("job_class") : nil,
13
+ args: args.is_a?(Hash) ? args.dig("arguments") : nil,
14
14
  pid: worker.pid,
15
15
  }
16
16
  else
@@ -19,17 +19,23 @@ module EasyML
19
19
  end
20
20
  end
21
21
 
22
- def find_job(worker_class, *args)
22
+ def find_job(worker_class, *args, &block)
23
23
  list_workers.select do |config|
24
- config.dig(:class) == worker_class.to_s && config.dig(:args) == args
24
+ selected = config.dig(:class) == worker_class.to_s
25
+ if block_given?
26
+ selected &&= yield(config)
27
+ else
28
+ selected &= config.dig(:args) == args
29
+ end
30
+ selected
25
31
  end
26
32
  end
27
33
 
28
- def kill(worker_class, *args)
29
- find_job(worker_class, *args).each do |job|
34
+ def kill(worker_class, *args, &block)
35
+ find_job(worker_class, *args, &block).each do |job|
30
36
  begin
31
- # Send TERM signal to the process
32
- Process.kill("TERM", job[:pid])
37
+ # Send HUP signal to the process
38
+ Process.kill("USR1", job[:pid])
33
39
 
34
40
  # Remove the worker from Redis so it doesn't show up as a zombie
35
41
  # in the Resque web interface. This is important because:
@@ -37,12 +43,10 @@ module EasyML
37
43
  # 2. Prevents confusion about running workers
38
44
  # 3. Allows proper worker cleanup in Redis
39
45
  job[:worker].done_working
40
- job[:worker].unregister_worker
41
46
  rescue Errno::ESRCH
42
47
  # Process already gone, but still try to clean up Redis
43
48
  begin
44
49
  job[:worker].done_working
45
- job[:worker].unregister_worker
46
50
  rescue => e
47
51
  # Redis cleanup failed, worker might already be unregistered
48
52
  puts "Failed to unregister worker: #{e.message}"
@@ -1,5 +1,7 @@
1
1
  module EasyML
2
2
  class RefreshDatasetJob < ApplicationJob
3
+ @queue = :easy_ml
4
+
3
5
  def perform(id)
4
6
  begin
5
7
  dataset = EasyML::Dataset.find(id)
@@ -8,6 +8,7 @@ module EasyML
8
8
 
9
9
  begin
10
10
  datasource.refresh
11
+ datasource.after_sync
11
12
  rescue StandardError => e
12
13
  datasource.update!(is_syncing: false)
13
14
  handle_error(datasource, e)
@@ -8,23 +8,7 @@ module EasyML
8
8
  end
9
9
 
10
10
  def deserialize_dataframe(df_data)
11
- return unless df_data.present? && df_data.key?("columns")
12
-
13
- columns = df_data["columns"].map do |col|
14
- dtype = case col["datatype"]
15
- when Hash
16
- if col["datatype"]["Datetime"]
17
- Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
18
- else
19
- Polars::Utf8
20
- end
21
- else
22
- Polars.const_get(col["datatype"])
23
- end
24
- Polars::Series.new(col["name"], col["values"], dtype: dtype)
25
- end
26
-
27
- Polars::DataFrame.new(columns)
11
+ Polars::DataFrame.new(df_data)
28
12
  end
29
13
  end
30
14
  end
@@ -63,7 +63,7 @@ module EasyML
63
63
  if column.is_computed
64
64
  column.statistics.dig(:processed, *args)
65
65
  else
66
- column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
66
+ column.statistics.dig(:raw, *args)
67
67
  end
68
68
  end
69
69
 
@@ -10,7 +10,7 @@ module EasyML
10
10
 
11
11
  def transform(df)
12
12
  df = df.with_column(
13
- Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
13
+ Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
14
14
  )
15
15
  df
16
16
  end
@@ -24,14 +24,6 @@ module EasyML
24
24
  end
25
25
  end
26
26
 
27
- def clipped
28
- Selector.new(column, :raw) do |df|
29
- column.imputers.training.clip(df)
30
- end
31
- end
32
-
33
- measure_method_timing :clipped
34
-
35
27
  def processed
36
28
  Selector.new(column, :processed)
37
29
  end
@@ -140,7 +140,7 @@ module EasyML
140
140
  end
141
141
  end
142
142
 
143
- delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
143
+ delegate :raw, :processed, :data, :train, :test, :valid, to: :data_selector
144
144
 
145
145
  def empty?
146
146
  data.blank?
@@ -15,8 +15,8 @@ module EasyML
15
15
  (column.one_hot? && type.to_sym == :processed)
16
16
  end
17
17
 
18
- TYPES_ALL = %i(raw clipped processed)
19
- TYPES_RAW = %i(raw clipped)
18
+ TYPES_ALL = %i(raw processed)
19
+ TYPES_RAW = %i(raw)
20
20
  TYPES_PROCESSED = %i(processed)
21
21
 
22
22
  def types(type = :all)
@@ -19,7 +19,9 @@ module EasyML
19
19
  end
20
20
 
21
21
  def fetch_df(split, type)
22
- @dataset.send(type).send(split, all_columns: true)
22
+ dataset.columns.apply_clip(
23
+ @dataset.send(type).send(split, all_columns: true)
24
+ )
23
25
  end
24
26
 
25
27
  def execute_queries(split, type)
@@ -21,7 +21,10 @@ module EasyML
21
21
 
22
22
  def run_queries(split, type)
23
23
  queries = build_queries(split, type)
24
- @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
24
+
25
+ dataset.columns.apply_clip(
26
+ @dataset.send(type).send(split, all_columns: true, lazy: true)
27
+ ).select(queries).collect
25
28
  end
26
29
 
27
30
  def get_column_statistics(query_results)
@@ -180,6 +180,8 @@ module EasyML
180
180
  EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
181
181
  update(workflow_status: :ready)
182
182
  unlock!
183
+ features.update_all(needs_fit: true, workflow_status: "ready")
184
+ features.each(&:wipe)
183
185
  end
184
186
 
185
187
  def refresh_async
@@ -201,12 +203,6 @@ module EasyML
201
203
  @raw = initialize_split("raw")
202
204
  end
203
205
 
204
- def clipped
205
- return @clipped if @clipped && @clipped.dataset
206
-
207
- @clipped = initialize_split("clipped")
208
- end
209
-
210
206
  def processed
211
207
  return @processed if @processed && @processed.dataset
212
208
 
@@ -287,6 +283,7 @@ module EasyML
287
283
 
288
284
  def fit_features(async: false, features: self.features, force: false)
289
285
  features_to_compute = force ? features : features.needs_fit
286
+ puts "Features to compute.... #{features_to_compute}"
290
287
  return after_fit_features if features_to_compute.empty?
291
288
 
292
289
  features.first.fit(features: features_to_compute, async: async)
@@ -295,10 +292,12 @@ module EasyML
295
292
  measure_method_timing :fit_features
296
293
 
297
294
  def after_fit_features
295
+ puts "After fit features"
298
296
  unlock!
299
297
  reload
300
298
  return if failed?
301
299
 
300
+ puts "Actually refresh..."
302
301
  actually_refresh
303
302
  end
304
303
 
@@ -385,6 +384,7 @@ module EasyML
385
384
 
386
385
  def unlock!
387
386
  Support::Lockable.unlock!(lock_key)
387
+ features.each(&:unlock!)
388
388
  end
389
389
 
390
390
  def locked?
@@ -490,6 +490,24 @@ module EasyML
490
490
  df
491
491
  end
492
492
 
493
+ # Massage out one-hot cats to their canonical name
494
+ #
495
+ # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
496
+ # Returns: ["Embarked", "Sex", "PassengerId"]
497
+ def regular_columns(col_list)
498
+ one_hot_cats = columns.allowed_categories.invert.reduce({}) do |h, (k, v)|
499
+ h.tap do
500
+ k.each do |k2|
501
+ h["#{v}_#{k2}"] = v
502
+ end
503
+ end
504
+ end
505
+
506
+ col_list.map do |col|
507
+ one_hot_cats.key?(col) ? one_hot_cats[col] : col
508
+ end.uniq.sort
509
+ end
510
+
493
511
  measure_method_timing :normalize
494
512
 
495
513
  def missing_required_fields(df)
@@ -537,7 +555,6 @@ module EasyML
537
555
 
538
556
  def cleanup
539
557
  raw.cleanup
540
- clipped.cleanup
541
558
  processed.cleanup
542
559
  end
543
560
 
@@ -730,10 +747,8 @@ module EasyML
730
747
 
731
748
  def initialize_splits
732
749
  @raw = nil
733
- @clipped = nil
734
750
  @processed = nil
735
751
  raw
736
- clipped
737
752
  processed
738
753
  end
739
754
 
@@ -778,7 +793,7 @@ module EasyML
778
793
  processed.cleanup
779
794
 
780
795
  SPLIT_ORDER.each do |segment|
781
- df = clipped.read(segment)
796
+ df = raw.read(segment)
782
797
  learn_computed_columns(df) if segment == :train
783
798
  processed_df = normalize(df, all_columns: true)
784
799
  processed.save(segment, processed_df)
@@ -825,26 +840,9 @@ module EasyML
825
840
  end
826
841
 
827
842
  def fit
828
- apply_clip
829
843
  learn_statistics(type: :raw)
830
844
  end
831
845
 
832
- def apply_clip
833
- clipped.cleanup
834
-
835
- SPLIT_ORDER.each do |segment|
836
- df = raw.send(segment, lazy: true, all_columns: true)
837
- clipped.save(
838
- segment,
839
- columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
840
- )
841
- end
842
- end
843
-
844
- measure_method_timing :apply_clip
845
-
846
- # log_method :fit, "Learning statistics", verbose: true
847
-
848
846
  def split_data!
849
847
  split_data(force: true)
850
848
  end
@@ -22,7 +22,6 @@ module EasyML
22
22
  DATASOURCE_OPTIONS = {
23
23
  "s3" => "EasyML::Datasources::S3Datasource",
24
24
  "file" => "EasyML::Datasources::FileDatasource",
25
- "polars" => "EasyML::Datasources::PolarsDatasource",
26
25
  }
27
26
  DATASOURCE_TYPES = [
28
27
  {
@@ -35,11 +34,6 @@ module EasyML
35
34
  label: "Local Files",
36
35
  description: "Connect to data stored in local files",
37
36
  },
38
- {
39
- value: "polars",
40
- label: "Polars DataFrame",
41
- description: "In-memory dataframe storage using Polars",
42
- },
43
37
  ].freeze
44
38
  DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
45
39
  DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize)
@@ -88,6 +88,7 @@ module EasyML
88
88
  before_save :update_sha
89
89
  after_find :update_from_feature_class
90
90
  before_save :update_from_feature_class
91
+ before_destroy :wipe
91
92
 
92
93
  def feature_klass
93
94
  feature_class.constantize
@@ -197,7 +198,7 @@ module EasyML
197
198
  end
198
199
 
199
200
  EasyML::Data::Partition::Boundaries.new(
200
- reader.data(lazy: true),
201
+ reader.data(lazy: true, all_columns: true),
201
202
  primary_key,
202
203
  batch_size
203
204
  ).to_a.map.with_index do |partition, idx|
@@ -207,7 +208,6 @@ module EasyML
207
208
  batch_end: partition[:partition_end],
208
209
  batch_number: feature_position,
209
210
  subbatch_number: idx,
210
- parent_batch_id: Random.uuid,
211
211
  }
212
212
  end
213
213
  end
@@ -218,7 +218,12 @@ module EasyML
218
218
 
219
219
  def fit(features: [self], async: false)
220
220
  ordered_features = features.sort_by(&:feature_position)
221
- jobs = ordered_features.map(&:build_batches)
221
+ parent_batch_id = Random.uuid
222
+ jobs = ordered_features.map do |feature|
223
+ feature.build_batches.map do |batch_args|
224
+ batch_args.merge(parent_batch_id: parent_batch_id)
225
+ end
226
+ end
222
227
  job_count = jobs.dup.flatten.size
223
228
 
224
229
  ordered_features.each(&:wipe)
@@ -454,6 +459,10 @@ module EasyML
454
459
  update!(updates)
455
460
  end
456
461
 
462
+ def unlock!
463
+ feature_store.unlock!
464
+ end
465
+
457
466
  UNCONFIGURABLE_COLUMNS = %w(
458
467
  id
459
468
  dataset_id