easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/models_controller.rb +3 -2
  3. data/app/frontend/components/ModelForm.tsx +16 -0
  4. data/app/frontend/components/ScheduleModal.tsx +0 -2
  5. data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
  6. data/app/jobs/easy_ml/application_job.rb +1 -0
  7. data/app/jobs/easy_ml/batch_job.rb +47 -6
  8. data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
  9. data/app/jobs/easy_ml/reaper.rb +14 -10
  10. data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
  11. data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
  12. data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
  13. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  14. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
  15. data/app/models/easy_ml/column/imputers/today.rb +1 -1
  16. data/app/models/easy_ml/column/selector.rb +0 -8
  17. data/app/models/easy_ml/column.rb +1 -1
  18. data/app/models/easy_ml/dataset/learner/base.rb +2 -2
  19. data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
  20. data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
  21. data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
  22. data/app/models/easy_ml/dataset.rb +29 -76
  23. data/app/models/easy_ml/datasource.rb +0 -6
  24. data/app/models/easy_ml/feature.rb +27 -38
  25. data/app/models/easy_ml/model.rb +20 -2
  26. data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
  27. data/app/models/easy_ml/models/xgboost.rb +52 -36
  28. data/app/models/easy_ml/retraining_run.rb +1 -1
  29. data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
  30. data/app/serializers/easy_ml/model_serializer.rb +1 -0
  31. data/lib/easy_ml/core/tuner.rb +7 -4
  32. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
  33. data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
  34. data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
  35. data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
  36. data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
  37. data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
  38. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
  39. data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
  40. data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
  41. data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
  42. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
  43. data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
  44. data/lib/easy_ml/data/dataset_manager.rb +140 -0
  45. data/lib/easy_ml/data/partition/boundaries.rb +60 -0
  46. data/lib/easy_ml/data/partition.rb +7 -0
  47. data/lib/easy_ml/data/polars_column.rb +19 -5
  48. data/lib/easy_ml/data/synced_directory.rb +1 -2
  49. data/lib/easy_ml/data.rb +2 -0
  50. data/lib/easy_ml/engine.rb +16 -14
  51. data/lib/easy_ml/feature_store.rb +21 -188
  52. data/lib/easy_ml/reasons.rb +41 -0
  53. data/lib/easy_ml/support/lockable.rb +1 -5
  54. data/lib/easy_ml/version.rb +1 -1
  55. data/lib/easy_ml.rb +1 -1
  56. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  57. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
  58. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
  59. metadata +24 -9
  60. data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
  61. data/lib/easy_ml/data/filter_extensions.rb +0 -31
  62. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
  63. data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1eebc157e0f33c3da40ef2b1bdb7cc0ed1c2b6f73615cdf26a6898cb60e60d2d
4
- data.tar.gz: a12b441fe0736f251de773574858316346ba19c5b3784d73f3db200af0e619e4
3
+ metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
4
+ data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
5
5
  SHA512:
6
- metadata.gz: 4aabb816a9d02a6f2bd870cde3db3eaaf00a314cf5e0d50a11bf707534b9d93eddee648d62304f48976916ea9d5942269dbeded81d49df23199ffcc13d6ae0eb
7
- data.tar.gz: 284973f49424ac622ceb3e44071e88336ea316154dee788b0e7c865441eeb01939192289deea84283b691bf8f5a3b79f708d3d62ab9fcec3d596f67ff4c093a9
6
+ metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
7
+ data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
data/app/controllers/easy_ml/models_controller.rb CHANGED
@@ -30,7 +30,7 @@ module EasyML
30
30
  def new
31
31
  render inertia: "pages/NewModelPage", props: {
32
32
  datasets: EasyML::Dataset.all.map do |dataset|
33
- dataset.slice(:id, :name, :num_rows)
33
+ dataset_to_json(dataset)
34
34
  end,
35
35
  constants: EasyML::Model.constants,
36
36
  }
@@ -41,7 +41,7 @@ module EasyML
41
41
  render inertia: "pages/EditModelPage", props: {
42
42
  model: model_to_json(model),
43
43
  datasets: EasyML::Dataset.all.map do |dataset|
44
- dataset.slice(:id, :name, :num_rows)
44
+ dataset_to_json(dataset)
45
45
  end,
46
46
  constants: EasyML::Model.constants,
47
47
  }
@@ -177,6 +177,7 @@ module EasyML
177
177
  :dataset_id,
178
178
  :task,
179
179
  :objective,
180
+ :weights_column,
180
181
  metrics: [],
181
182
  retraining_job_attributes: [
182
183
  :id,
data/app/frontend/components/ModelForm.tsx CHANGED
@@ -16,6 +16,7 @@ interface ModelFormProps {
16
16
  task: string;
17
17
  objective?: string;
18
18
  metrics?: string[];
19
+ weights_column?: string;
19
20
  retraining_job?: {
20
21
  frequency: string;
21
22
  at: {
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
75
76
  task: initialData?.task || 'classification',
76
77
  objective: initialData?.objective || 'binary:logistic',
77
78
  metrics: initialData?.metrics || ['accuracy_score'],
79
+ weights_column: initialData?.weights_column || '',
78
80
  retraining_job_attributes: initialData?.retraining_job ? {
79
81
  id: initialData.retraining_job.id,
80
82
  frequency: initialData.retraining_job.frequency,
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
165
167
  };
166
168
 
167
169
  const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
170
+ const columns = selectedDataset?.columns || [];
168
171
 
169
172
  const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
170
173
 
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
246
249
  <ErrorDisplay error={errors.dataset_id} />
247
250
  </div>
248
251
 
252
+ <div>
253
+ <label className="block text-sm font-medium text-gray-700 mb-1">
254
+ Weights Column (Optional)
255
+ </label>
256
+ <SearchableSelect
257
+ value={data.model.weights_column}
258
+ options={columns.map(col => ({ value: col.name, label: col.name }))}
259
+ onChange={(value) => setData('model.weights_column', value)}
260
+ isClearable={true}
261
+ />
262
+ <ErrorDisplay error={errors.weights_column} />
263
+ </div>
264
+
249
265
  <div>
250
266
  <label className="block text-sm font-medium text-gray-700 mb-1">
251
267
  Task
data/app/frontend/components/ScheduleModal.tsx CHANGED
@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
587
587
  value={formData.retraining_job_attributes.threshold}
588
588
  onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
589
589
  step={0.01}
590
- min={0}
591
- max={1}
592
590
  className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
593
591
  />
594
592
  </div>
data/app/frontend/components/dataset/PreprocessingConfig.tsx CHANGED
@@ -250,16 +250,17 @@ export function PreprocessingConfig({
250
250
  setIsEditingDescription(true);
251
251
  };
252
252
 
253
- let nullCount = (column.statistics?.processed.null_count || column.statistics?.raw?.null_count) || 0;
254
- const nullPercentage = nullCount && column.statistics?.raw.num_rows
255
- ? ((nullCount / column.statistics.raw.num_rows) * 100)
253
+ let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
254
+ let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
255
+ const nullPercentage = nullCount && numRows
256
+ ? ((nullCount / numRows) * 100)
256
257
  : 0;
257
258
 
258
- const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.raw.num_rows
259
- ? ((column.statistics.processed.null_count / column.statistics.raw.num_rows) * 100)
259
+ const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
260
+ ? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
260
261
  : 0;
261
262
 
262
- const totalRows = column.statistics?.raw.num_rows ?? 0;
263
+ const totalRows = numRows;
263
264
 
264
265
  const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
265
266
  const strategy = type === 'training' ? training : inference;
data/app/jobs/easy_ml/application_job.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module EasyML
2
2
  class ApplicationJob < ActiveJob::Base
3
+ @queue = :easy_ml
3
4
  queue_as :easy_ml
4
5
 
5
6
  def create_event(model, status, error = nil)
data/app/jobs/easy_ml/batch_job.rb CHANGED
@@ -39,15 +39,15 @@ module EasyML
39
39
  rest.map do |batch|
40
40
  Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
41
41
  end
42
-
42
+ track_batch(parent_id)
43
43
  handle_batch(parent_id, batch)
44
44
  end
45
45
 
46
46
  def handle_batch(parent_id, batch)
47
47
  if batch.size > 1
48
- enqueue_batch(batch)
48
+ enqueue_batch(batch, parent_id)
49
49
  else
50
- run_one_batch(parent_id, batch.first)
50
+ new.perform(parent_id, batch.first)
51
51
  after_batch_hook(parent_id, batch)
52
52
  end
53
53
  end
@@ -60,7 +60,21 @@ module EasyML
60
60
  end
61
61
 
62
62
  def next_batch?(parent_id)
63
- batches_remaining(parent_id) > 0
63
+ (batches_remaining(parent_id) > 0)
64
+ end
65
+
66
+ def list_batches
67
+ Resque.redis.hkeys("batches:tracking")
68
+ end
69
+
70
+ def track_batch(parent_id)
71
+ Resque.redis.hset("batches:tracking", parent_id, 1)
72
+ end
73
+
74
+ def cleanup_all
75
+ list_batches.each do |batch_id|
76
+ cleanup_batch(batch_id)
77
+ end
64
78
  end
65
79
 
66
80
  def batches_remaining(parent_id)
@@ -69,12 +83,39 @@ module EasyML
69
83
 
70
84
  def cleanup_batch(parent_id)
71
85
  Resque.redis.del("batch:#{parent_id}:remaining")
86
+ Resque.redis.hdel("batches:tracking", parent_id)
72
87
  end
73
88
 
74
- private
89
+ def batch_args
90
+ list_batches.map do |batch_id|
91
+ fetch_batch_arguments(batch_id)
92
+ end
93
+ end
94
+
95
+ def select_batches(&block)
96
+ list_batches.select do |batch_id|
97
+ yield fetch_batch_arguments(batch_id)
98
+ end
99
+ end
100
+
101
+ def poll
102
+ while true
103
+ sleep 2
104
+ EasyML::BatchJob.list_batches.map do |batch|
105
+ puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
106
+ end
107
+ end
108
+ end
75
109
 
76
110
  def get_parent_batch_id(args_list)
77
- args_list.dup.flatten.first.dig(:parent_batch_id)
111
+ args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
112
+ end
113
+
114
+ private
115
+
116
+ def get_args_list(batch_id)
117
+ redis_key = "#{batch(batch_id)}:original_args"
118
+ redis.get(redis_key)
78
119
  end
79
120
 
80
121
  # Store batch arguments in Redis
data/app/jobs/easy_ml/compute_feature_job.rb CHANGED
@@ -14,31 +14,31 @@ module EasyML
14
14
  #
15
15
  # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
16
16
  batch_args = batch_args.dup
17
- run_one_batch(batch_id, batch_args)
17
+ EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
18
18
  end
19
19
 
20
- def self.run_one_batch(batch_id, batch_args)
20
+ def perform(batch_id, batch_args = {})
21
21
  EasyML::Feature.fit_one_batch(batch_id, batch_args)
22
22
  end
23
23
 
24
24
  def self.after_batch_hook(batch_id, *args)
25
- batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
26
- feature_ids = batch_args.pluck(:feature_id).uniq
27
- parent_id = batch_args.pluck(:parent_batch_id).first
25
+ args = args.flatten.first.with_indifferent_access
26
+ feature_id = args.dig(:feature_id)
28
27
 
29
- feature = EasyML::Feature.find_by(id: feature_ids.first)
28
+ feature = EasyML::Feature.find_by(id: feature_id)
30
29
 
31
30
  if feature.failed?
32
31
  dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
33
- return BatchJob.cleanup_batch(parent_id)
32
+ return BatchJob.cleanup_batch(batch_id)
34
33
  end
35
34
 
36
35
  feature.after_fit
37
36
 
38
- if BatchJob.next_batch?(parent_id)
39
- BatchJob.enqueue_next_batch(self, parent_id)
37
+ if BatchJob.next_batch?(batch_id)
38
+ BatchJob.enqueue_next_batch(self, batch_id)
40
39
  else
41
- dataset = EasyML::Feature.find_by(id: feature_ids.first).dataset
40
+ cleanup_batch(batch_id)
41
+ dataset = feature.dataset
42
42
  dataset.after_fit_features
43
43
  end
44
44
  end
data/app/jobs/easy_ml/reaper.rb CHANGED
@@ -9,8 +9,8 @@ module EasyML
9
9
  {
10
10
  worker: worker,
11
11
  working: true,
12
- class: args.dig("job_class"),
13
- args: args.dig("arguments"),
12
+ class: args.is_a?(Hash) ? args.dig("job_class") : nil,
13
+ args: args.is_a?(Hash) ? args.dig("arguments") : nil,
14
14
  pid: worker.pid,
15
15
  }
16
16
  else
@@ -19,17 +19,23 @@ module EasyML
19
19
  end
20
20
  end
21
21
 
22
- def find_job(worker_class, *args)
22
+ def find_job(worker_class, *args, &block)
23
23
  list_workers.select do |config|
24
- config.dig(:class) == worker_class.to_s && config.dig(:args) == args
24
+ selected = config.dig(:class) == worker_class.to_s
25
+ if block_given?
26
+ selected &&= yield(config)
27
+ else
28
+ selected &= config.dig(:args) == args
29
+ end
30
+ selected
25
31
  end
26
32
  end
27
33
 
28
- def kill(worker_class, *args)
29
- find_job(worker_class, *args).each do |job|
34
+ def kill(worker_class, *args, &block)
35
+ find_job(worker_class, *args, &block).each do |job|
30
36
  begin
31
- # Send TERM signal to the process
32
- Process.kill("TERM", job[:pid])
37
+ # Send HUP signal to the process
38
+ Process.kill("USR1", job[:pid])
33
39
 
34
40
  # Remove the worker from Redis so it doesn't show up as a zombie
35
41
  # in the Resque web interface. This is important because:
@@ -37,12 +43,10 @@ module EasyML
37
43
  # 2. Prevents confusion about running workers
38
44
  # 3. Allows proper worker cleanup in Redis
39
45
  job[:worker].done_working
40
- job[:worker].unregister_worker
41
46
  rescue Errno::ESRCH
42
47
  # Process already gone, but still try to clean up Redis
43
48
  begin
44
49
  job[:worker].done_working
45
- job[:worker].unregister_worker
46
50
  rescue => e
47
51
  # Redis cleanup failed, worker might already be unregistered
48
52
  puts "Failed to unregister worker: #{e.message}"
data/app/jobs/easy_ml/refresh_dataset_job.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  module EasyML
2
2
  class RefreshDatasetJob < ApplicationJob
3
+ @queue = :easy_ml
4
+
3
5
  def perform(id)
4
6
  begin
5
7
  dataset = EasyML::Dataset.find(id)
data/app/jobs/easy_ml/sync_datasource_job.rb CHANGED
@@ -8,6 +8,7 @@ module EasyML
8
8
 
9
9
  begin
10
10
  datasource.refresh
11
+ datasource.after_sync
11
12
  rescue StandardError => e
12
13
  datasource.update!(is_syncing: false)
13
14
  handle_error(datasource, e)
data/app/models/concerns/easy_ml/dataframe_serialization.rb CHANGED
@@ -8,23 +8,7 @@ module EasyML
8
8
  end
9
9
 
10
10
  def deserialize_dataframe(df_data)
11
- return unless df_data.present? && df_data.key?("columns")
12
-
13
- columns = df_data["columns"].map do |col|
14
- dtype = case col["datatype"]
15
- when Hash
16
- if col["datatype"]["Datetime"]
17
- Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
18
- else
19
- Polars::Utf8
20
- end
21
- else
22
- Polars.const_get(col["datatype"])
23
- end
24
- Polars::Series.new(col["name"], col["values"], dtype: dtype)
25
- end
26
-
27
- Polars::DataFrame.new(columns)
11
+ Polars::DataFrame.new(df_data)
28
12
  end
29
13
  end
30
14
  end
data/app/models/easy_ml/column/imputers/base.rb CHANGED
@@ -63,7 +63,7 @@ module EasyML
63
63
  if column.is_computed
64
64
  column.statistics.dig(:processed, *args)
65
65
  else
66
- column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
66
+ column.statistics.dig(:raw, *args)
67
67
  end
68
68
  end
69
69
 
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb CHANGED
@@ -50,11 +50,7 @@ module EasyML
50
50
  end
51
51
 
52
52
  def cast_encoder(encoder)
53
- begin
54
- encoder.transform_keys { |k| column.cast(k) }
55
- rescue => e
56
- binding.pry
57
- end
53
+ encoder.transform_keys { |k| column.cast(k) }
58
54
  end
59
55
 
60
56
  def cast_decoder(decoder)
data/app/models/easy_ml/column/imputers/today.rb CHANGED
@@ -10,7 +10,7 @@ module EasyML
10
10
 
11
11
  def transform(df)
12
12
  df = df.with_column(
13
- Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
13
+ Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
14
14
  )
15
15
  df
16
16
  end
data/app/models/easy_ml/column/selector.rb CHANGED
@@ -24,14 +24,6 @@ module EasyML
24
24
  end
25
25
  end
26
26
 
27
- def clipped
28
- Selector.new(column, :raw) do |df|
29
- column.imputers.training.clip(df)
30
- end
31
- end
32
-
33
- measure_method_timing :clipped
34
-
35
27
  def processed
36
28
  Selector.new(column, :processed)
37
29
  end
data/app/models/easy_ml/column.rb CHANGED
@@ -140,7 +140,7 @@ module EasyML
140
140
  end
141
141
  end
142
142
 
143
- delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
143
+ delegate :raw, :processed, :data, :train, :test, :valid, to: :data_selector
144
144
 
145
145
  def empty?
146
146
  data.blank?
data/app/models/easy_ml/dataset/learner/base.rb CHANGED
@@ -15,8 +15,8 @@ module EasyML
15
15
  (column.one_hot? && type.to_sym == :processed)
16
16
  end
17
17
 
18
- TYPES_ALL = %i(raw clipped processed)
19
- TYPES_RAW = %i(raw clipped)
18
+ TYPES_ALL = %i(raw processed)
19
+ TYPES_RAW = %i(raw)
20
20
  TYPES_PROCESSED = %i(processed)
21
21
 
22
22
  def types(type = :all)
data/app/models/easy_ml/dataset/learner/eager.rb CHANGED
@@ -19,7 +19,9 @@ module EasyML
19
19
  end
20
20
 
21
21
  def fetch_df(split, type)
22
- @dataset.send(type).send(split, all_columns: true)
22
+ dataset.columns.apply_clip(
23
+ @dataset.send(type).send(split, all_columns: true)
24
+ )
23
25
  end
24
26
 
25
27
  def execute_queries(split, type)
data/app/models/easy_ml/dataset/learner/lazy.rb CHANGED
@@ -21,7 +21,10 @@ module EasyML
21
21
 
22
22
  def run_queries(split, type)
23
23
  queries = build_queries(split, type)
24
- @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
24
+
25
+ dataset.columns.apply_clip(
26
+ @dataset.send(type).send(split, all_columns: true, lazy: true)
27
+ ).select(queries).collect
25
28
  end
26
29
 
27
30
  def get_column_statistics(query_results)
data/app/models/easy_ml/dataset/refresh_reasons.rb ADDED
@@ -0,0 +1,12 @@
1
+ module EasyML
2
+ class Dataset
3
+ class RefreshReasons < EasyML::Reasons
4
+ add_reason "Not split", -> { not_split? }
5
+ add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
6
+ add_reason "Columns need refresh", -> { columns_need_refresh? }
7
+ add_reason "Features need refresh", -> { features_need_fit? }
8
+ add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
9
+ add_reason "Datasource was refreshed", -> { datasource_was_refreshed? }
10
+ end
11
+ end
12
+ end
data/app/models/easy_ml/dataset.rb CHANGED
@@ -180,6 +180,8 @@ module EasyML
180
180
  EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
181
181
  update(workflow_status: :ready)
182
182
  unlock!
183
+ features.update_all(needs_fit: true, workflow_status: "ready")
184
+ features.each(&:wipe)
183
185
  end
184
186
 
185
187
  def refresh_async
@@ -201,12 +203,6 @@ module EasyML
201
203
  @raw = initialize_split("raw")
202
204
  end
203
205
 
204
- def clipped
205
- return @clipped if @clipped && @clipped.dataset
206
-
207
- @clipped = initialize_split("clipped")
208
- end
209
-
210
206
  def processed
211
207
  return @processed if @processed && @processed.dataset
212
208
 
@@ -265,9 +261,7 @@ module EasyML
265
261
 
266
262
  def refresh!(async: false)
267
263
  refreshing do
268
- puts "Prepare..."
269
264
  prepare!
270
- puts "Fit features..."
271
265
  fit_features!(async: async)
272
266
  end
273
267
  end
@@ -276,9 +270,7 @@ module EasyML
276
270
  return refresh_async if async
277
271
 
278
272
  refreshing do
279
- puts "prepare.."
280
273
  prepare
281
- puts "fit features..."
282
274
  fit_features(async: async)
283
275
  end
284
276
  end
@@ -291,6 +283,7 @@ module EasyML
291
283
 
292
284
  def fit_features(async: false, features: self.features, force: false)
293
285
  features_to_compute = force ? features : features.needs_fit
286
+ puts "Features to compute.... #{features_to_compute}"
294
287
  return after_fit_features if features_to_compute.empty?
295
288
 
296
289
  features.first.fit(features: features_to_compute, async: async)
@@ -299,11 +292,12 @@ module EasyML
299
292
  measure_method_timing :fit_features
300
293
 
301
294
  def after_fit_features
302
- puts "after fit features..."
295
+ puts "After fit features"
303
296
  unlock!
304
297
  reload
305
298
  return if failed?
306
299
 
300
+ puts "Actually refresh..."
307
301
  actually_refresh
308
302
  end
309
303
 
@@ -338,45 +332,12 @@ module EasyML
338
332
  #
339
333
  # So yes this is an annoying way to structure a method, but it's helpful for performance
340
334
  #
341
- def refresh_reasons(exclude: [])
342
- {
343
- not_split: {
344
- name: "Not split",
345
- check: -> { not_split? },
346
- },
347
- refreshed_at_is_nil: {
348
- name: "Refreshed at is nil",
349
- check: -> { refreshed_at.nil? },
350
- },
351
- columns_need_refresh: {
352
- name: "Columns need refresh",
353
- check: -> { columns_need_refresh? },
354
- },
355
- features_need_fit: {
356
- name: "Features need refresh",
357
- check: -> { features_need_fit? },
358
- },
359
- datasource_needs_refresh: {
360
- name: "Datasource needs refresh",
361
- check: -> { datasource_needs_refresh? },
362
- },
363
- refreshed_datasource: {
364
- name: "Refreshed datasource",
365
- check: -> { refreshed_datasource? },
366
- },
367
- datasource_was_refreshed: {
368
- name: "Datasource was refreshed",
369
- check: -> { datasource_was_refreshed? },
370
- },
371
- }.except(*exclude).select do |k, config|
372
- config[:check].call
373
- end.map do |k, config|
374
- config[:name]
375
- end
335
+ def refresh_reasons(except: [])
336
+ RefreshReasons.new(self).check(except: except)
376
337
  end
377
338
 
378
- def needs_refresh?(exclude: [])
379
- refresh_reasons(exclude: exclude).any?
339
+ def needs_refresh?(except: [])
340
+ refresh_reasons(except: except).any?
380
341
  end
381
342
 
382
343
  def processed?
@@ -423,6 +384,7 @@ module EasyML
423
384
 
424
385
  def unlock!
425
386
  Support::Lockable.unlock!(lock_key)
387
+ features.each(&:unlock!)
426
388
  end
427
389
 
428
390
  def locked?
@@ -518,23 +480,34 @@ module EasyML
518
480
  end
519
481
 
520
482
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
521
- puts "Apply missing features..."
522
483
  df = apply_missing_columns(df, inference: inference)
523
- puts "Transform columns..."
524
484
  df = columns.transform(df, inference: inference)
525
- puts "Apply features..."
526
485
  df = apply_features(df, features)
527
- puts "Transform columns..."
528
486
  df = columns.transform(df, inference: inference, computed: true)
529
- puts "Apply column mask..."
530
487
  df = apply_column_mask(df, inference: inference) unless all_columns
531
- puts "Drop nulls..."
532
488
  df = drop_nulls(df) unless inference
533
- puts "Split features and targets..."
534
489
  df, = processed.split_features_targets(df, true, target) if split_ys
535
490
  df
536
491
  end
537
492
 
493
+ # Massage out one-hot cats to their canonical name
494
+ #
495
+ # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
496
+ # Returns: ["Embarked", "Sex", "PassengerId"]
497
+ def regular_columns(col_list)
498
+ one_hot_cats = columns.allowed_categories.invert.reduce({}) do |h, (k, v)|
499
+ h.tap do
500
+ k.each do |k2|
501
+ h["#{v}_#{k2}"] = v
502
+ end
503
+ end
504
+ end
505
+
506
+ col_list.map do |col|
507
+ one_hot_cats.key?(col) ? one_hot_cats[col] : col
508
+ end.uniq.sort
509
+ end
510
+
538
511
  measure_method_timing :normalize
539
512
 
540
513
  def missing_required_fields(df)
@@ -582,7 +555,6 @@ module EasyML
582
555
 
583
556
  def cleanup
584
557
  raw.cleanup
585
- clipped.cleanup
586
558
  processed.cleanup
587
559
  end
588
560
 
@@ -775,10 +747,8 @@ module EasyML
775
747
 
776
748
  def initialize_splits
777
749
  @raw = nil
778
- @clipped = nil
779
750
  @processed = nil
780
751
  raw
781
- clipped
782
752
  processed
783
753
  end
784
754
 
@@ -823,7 +793,7 @@ module EasyML
823
793
  processed.cleanup
824
794
 
825
795
  SPLIT_ORDER.each do |segment|
826
- df = clipped.read(segment)
796
+ df = raw.read(segment)
827
797
  learn_computed_columns(df) if segment == :train
828
798
  processed_df = normalize(df, all_columns: true)
829
799
  processed.save(segment, processed_df)
@@ -870,26 +840,9 @@ module EasyML
870
840
  end
871
841
 
872
842
  def fit
873
- apply_clip
874
843
  learn_statistics(type: :raw)
875
844
  end
876
845
 
877
- def apply_clip
878
- clipped.cleanup
879
-
880
- SPLIT_ORDER.each do |segment|
881
- df = raw.send(segment, lazy: true, all_columns: true)
882
- clipped.save(
883
- segment,
884
- columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
885
- )
886
- end
887
- end
888
-
889
- measure_method_timing :apply_clip
890
-
891
- # log_method :fit, "Learning statistics", verbose: true
892
-
893
846
  def split_data!
894
847
  split_data(force: true)
895
848
  end