easy_ml 0.2.0.pre.rc77 → 0.2.0.pre.rc78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/models_controller.rb +3 -2
- data/app/frontend/components/ModelForm.tsx +16 -0
- data/app/frontend/components/ScheduleModal.tsx +0 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
- data/app/jobs/easy_ml/application_job.rb +1 -0
- data/app/jobs/easy_ml/batch_job.rb +47 -6
- data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
- data/app/jobs/easy_ml/reaper.rb +14 -10
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/today.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +0 -8
- data/app/models/easy_ml/column.rb +1 -1
- data/app/models/easy_ml/dataset/learner/base.rb +2 -2
- data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
- data/app/models/easy_ml/dataset.rb +25 -27
- data/app/models/easy_ml/datasource.rb +0 -6
- data/app/models/easy_ml/feature.rb +12 -3
- data/app/models/easy_ml/model.rb +20 -2
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
- data/app/models/easy_ml/models/xgboost.rb +52 -36
- data/app/models/easy_ml/retraining_run.rb +1 -1
- data/app/serializers/easy_ml/model_serializer.rb +1 -0
- data/lib/easy_ml/core/tuner.rb +7 -4
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +26 -9
- data/lib/easy_ml/data/dataset_manager/writer.rb +5 -1
- data/lib/easy_ml/data/dataset_manager.rb +8 -2
- data/lib/easy_ml/data/polars_column.rb +19 -5
- data/lib/easy_ml/engine.rb +16 -14
- data/lib/easy_ml/feature_store.rb +19 -16
- data/lib/easy_ml/support/lockable.rb +1 -5
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
- metadata +6 -7
- data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
|
4
|
+
data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
|
7
|
+
data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
|
@@ -30,7 +30,7 @@ module EasyML
|
|
30
30
|
def new
|
31
31
|
render inertia: "pages/NewModelPage", props: {
|
32
32
|
datasets: EasyML::Dataset.all.map do |dataset|
|
33
|
-
dataset
|
33
|
+
dataset_to_json(dataset)
|
34
34
|
end,
|
35
35
|
constants: EasyML::Model.constants,
|
36
36
|
}
|
@@ -41,7 +41,7 @@ module EasyML
|
|
41
41
|
render inertia: "pages/EditModelPage", props: {
|
42
42
|
model: model_to_json(model),
|
43
43
|
datasets: EasyML::Dataset.all.map do |dataset|
|
44
|
-
dataset
|
44
|
+
dataset_to_json(dataset)
|
45
45
|
end,
|
46
46
|
constants: EasyML::Model.constants,
|
47
47
|
}
|
@@ -177,6 +177,7 @@ module EasyML
|
|
177
177
|
:dataset_id,
|
178
178
|
:task,
|
179
179
|
:objective,
|
180
|
+
:weights_column,
|
180
181
|
metrics: [],
|
181
182
|
retraining_job_attributes: [
|
182
183
|
:id,
|
@@ -16,6 +16,7 @@ interface ModelFormProps {
|
|
16
16
|
task: string;
|
17
17
|
objective?: string;
|
18
18
|
metrics?: string[];
|
19
|
+
weights_column?: string;
|
19
20
|
retraining_job?: {
|
20
21
|
frequency: string;
|
21
22
|
at: {
|
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
75
76
|
task: initialData?.task || 'classification',
|
76
77
|
objective: initialData?.objective || 'binary:logistic',
|
77
78
|
metrics: initialData?.metrics || ['accuracy_score'],
|
79
|
+
weights_column: initialData?.weights_column || '',
|
78
80
|
retraining_job_attributes: initialData?.retraining_job ? {
|
79
81
|
id: initialData.retraining_job.id,
|
80
82
|
frequency: initialData.retraining_job.frequency,
|
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
165
167
|
};
|
166
168
|
|
167
169
|
const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
|
170
|
+
const columns = selectedDataset?.columns || [];
|
168
171
|
|
169
172
|
const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
|
170
173
|
|
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
246
249
|
<ErrorDisplay error={errors.dataset_id} />
|
247
250
|
</div>
|
248
251
|
|
252
|
+
<div>
|
253
|
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
254
|
+
Weights Column (Optional)
|
255
|
+
</label>
|
256
|
+
<SearchableSelect
|
257
|
+
value={data.model.weights_column}
|
258
|
+
options={columns.map(col => ({ value: col.name, label: col.name }))}
|
259
|
+
onChange={(value) => setData('model.weights_column', value)}
|
260
|
+
isClearable={true}
|
261
|
+
/>
|
262
|
+
<ErrorDisplay error={errors.weights_column} />
|
263
|
+
</div>
|
264
|
+
|
249
265
|
<div>
|
250
266
|
<label className="block text-sm font-medium text-gray-700 mb-1">
|
251
267
|
Task
|
@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
|
|
587
587
|
value={formData.retraining_job_attributes.threshold}
|
588
588
|
onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
|
589
589
|
step={0.01}
|
590
|
-
min={0}
|
591
|
-
max={1}
|
592
590
|
className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
|
593
591
|
/>
|
594
592
|
</div>
|
@@ -250,16 +250,17 @@ export function PreprocessingConfig({
|
|
250
250
|
setIsEditingDescription(true);
|
251
251
|
};
|
252
252
|
|
253
|
-
let nullCount = (column.statistics?.processed
|
254
|
-
|
255
|
-
|
253
|
+
let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
|
254
|
+
let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
|
255
|
+
const nullPercentage = nullCount && numRows
|
256
|
+
? ((nullCount / numRows) * 100)
|
256
257
|
: 0;
|
257
258
|
|
258
|
-
const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.
|
259
|
-
? ((column.statistics.processed.null_count / column.statistics.
|
259
|
+
const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
|
260
|
+
? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
|
260
261
|
: 0;
|
261
262
|
|
262
|
-
const totalRows =
|
263
|
+
const totalRows = numRows;
|
263
264
|
|
264
265
|
const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
|
265
266
|
const strategy = type === 'training' ? training : inference;
|
@@ -39,15 +39,15 @@ module EasyML
|
|
39
39
|
rest.map do |batch|
|
40
40
|
Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
|
41
41
|
end
|
42
|
-
|
42
|
+
track_batch(parent_id)
|
43
43
|
handle_batch(parent_id, batch)
|
44
44
|
end
|
45
45
|
|
46
46
|
def handle_batch(parent_id, batch)
|
47
47
|
if batch.size > 1
|
48
|
-
enqueue_batch(batch)
|
48
|
+
enqueue_batch(batch, parent_id)
|
49
49
|
else
|
50
|
-
|
50
|
+
new.perform(parent_id, batch.first)
|
51
51
|
after_batch_hook(parent_id, batch)
|
52
52
|
end
|
53
53
|
end
|
@@ -60,7 +60,21 @@ module EasyML
|
|
60
60
|
end
|
61
61
|
|
62
62
|
def next_batch?(parent_id)
|
63
|
-
batches_remaining(parent_id) > 0
|
63
|
+
(batches_remaining(parent_id) > 0)
|
64
|
+
end
|
65
|
+
|
66
|
+
def list_batches
|
67
|
+
Resque.redis.hkeys("batches:tracking")
|
68
|
+
end
|
69
|
+
|
70
|
+
def track_batch(parent_id)
|
71
|
+
Resque.redis.hset("batches:tracking", parent_id, 1)
|
72
|
+
end
|
73
|
+
|
74
|
+
def cleanup_all
|
75
|
+
list_batches.each do |batch_id|
|
76
|
+
cleanup_batch(batch_id)
|
77
|
+
end
|
64
78
|
end
|
65
79
|
|
66
80
|
def batches_remaining(parent_id)
|
@@ -69,12 +83,39 @@ module EasyML
|
|
69
83
|
|
70
84
|
def cleanup_batch(parent_id)
|
71
85
|
Resque.redis.del("batch:#{parent_id}:remaining")
|
86
|
+
Resque.redis.hdel("batches:tracking", parent_id)
|
72
87
|
end
|
73
88
|
|
74
|
-
|
89
|
+
def batch_args
|
90
|
+
list_batches.map do |batch_id|
|
91
|
+
fetch_batch_arguments(batch_id)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def select_batches(&block)
|
96
|
+
list_batches.select do |batch_id|
|
97
|
+
yield fetch_batch_arguments(batch_id)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def poll
|
102
|
+
while true
|
103
|
+
sleep 2
|
104
|
+
EasyML::BatchJob.list_batches.map do |batch|
|
105
|
+
puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
75
109
|
|
76
110
|
def get_parent_batch_id(args_list)
|
77
|
-
args_list.dup.flatten.
|
111
|
+
args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
|
112
|
+
end
|
113
|
+
|
114
|
+
private
|
115
|
+
|
116
|
+
def get_args_list(batch_id)
|
117
|
+
redis_key = "#{batch(batch_id)}:original_args"
|
118
|
+
redis.get(redis_key)
|
78
119
|
end
|
79
120
|
|
80
121
|
# Store batch arguments in Redis
|
@@ -14,31 +14,31 @@ module EasyML
|
|
14
14
|
#
|
15
15
|
# https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
|
16
16
|
batch_args = batch_args.dup
|
17
|
-
|
17
|
+
EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
|
18
18
|
end
|
19
19
|
|
20
|
-
def
|
20
|
+
def perform(batch_id, batch_args = {})
|
21
21
|
EasyML::Feature.fit_one_batch(batch_id, batch_args)
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.after_batch_hook(batch_id, *args)
|
25
|
-
|
26
|
-
|
27
|
-
parent_id = batch_args.pluck(:parent_batch_id).first
|
25
|
+
args = args.flatten.first.with_indifferent_access
|
26
|
+
feature_id = args.dig(:feature_id)
|
28
27
|
|
29
|
-
feature = EasyML::Feature.find_by(id:
|
28
|
+
feature = EasyML::Feature.find_by(id: feature_id)
|
30
29
|
|
31
30
|
if feature.failed?
|
32
31
|
dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
|
33
|
-
return BatchJob.cleanup_batch(
|
32
|
+
return BatchJob.cleanup_batch(batch_id)
|
34
33
|
end
|
35
34
|
|
36
35
|
feature.after_fit
|
37
36
|
|
38
|
-
if BatchJob.next_batch?(
|
39
|
-
BatchJob.enqueue_next_batch(self,
|
37
|
+
if BatchJob.next_batch?(batch_id)
|
38
|
+
BatchJob.enqueue_next_batch(self, batch_id)
|
40
39
|
else
|
41
|
-
|
40
|
+
cleanup_batch(batch_id)
|
41
|
+
dataset = feature.dataset
|
42
42
|
dataset.after_fit_features
|
43
43
|
end
|
44
44
|
end
|
data/app/jobs/easy_ml/reaper.rb
CHANGED
@@ -9,8 +9,8 @@ module EasyML
|
|
9
9
|
{
|
10
10
|
worker: worker,
|
11
11
|
working: true,
|
12
|
-
class: args.dig("job_class"),
|
13
|
-
args: args.dig("arguments"),
|
12
|
+
class: args.is_a?(Hash) ? args.dig("job_class") : nil,
|
13
|
+
args: args.is_a?(Hash) ? args.dig("arguments") : nil,
|
14
14
|
pid: worker.pid,
|
15
15
|
}
|
16
16
|
else
|
@@ -19,17 +19,23 @@ module EasyML
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
-
def find_job(worker_class, *args)
|
22
|
+
def find_job(worker_class, *args, &block)
|
23
23
|
list_workers.select do |config|
|
24
|
-
config.dig(:class) == worker_class.to_s
|
24
|
+
selected = config.dig(:class) == worker_class.to_s
|
25
|
+
if block_given?
|
26
|
+
selected &&= yield(config)
|
27
|
+
else
|
28
|
+
selected &= config.dig(:args) == args
|
29
|
+
end
|
30
|
+
selected
|
25
31
|
end
|
26
32
|
end
|
27
33
|
|
28
|
-
def kill(worker_class, *args)
|
29
|
-
find_job(worker_class, *args).each do |job|
|
34
|
+
def kill(worker_class, *args, &block)
|
35
|
+
find_job(worker_class, *args, &block).each do |job|
|
30
36
|
begin
|
31
|
-
# Send
|
32
|
-
Process.kill("
|
37
|
+
# Send USR1 signal to the process
|
38
|
+
Process.kill("USR1", job[:pid])
|
33
39
|
|
34
40
|
# Remove the worker from Redis so it doesn't show up as a zombie
|
35
41
|
# in the Resque web interface. This is important because:
|
@@ -37,12 +43,10 @@ module EasyML
|
|
37
43
|
# 2. Prevents confusion about running workers
|
38
44
|
# 3. Allows proper worker cleanup in Redis
|
39
45
|
job[:worker].done_working
|
40
|
-
job[:worker].unregister_worker
|
41
46
|
rescue Errno::ESRCH
|
42
47
|
# Process already gone, but still try to clean up Redis
|
43
48
|
begin
|
44
49
|
job[:worker].done_working
|
45
|
-
job[:worker].unregister_worker
|
46
50
|
rescue => e
|
47
51
|
# Redis cleanup failed, worker might already be unregistered
|
48
52
|
puts "Failed to unregister worker: #{e.message}"
|
@@ -8,23 +8,7 @@ module EasyML
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def deserialize_dataframe(df_data)
|
11
|
-
|
12
|
-
|
13
|
-
columns = df_data["columns"].map do |col|
|
14
|
-
dtype = case col["datatype"]
|
15
|
-
when Hash
|
16
|
-
if col["datatype"]["Datetime"]
|
17
|
-
Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
|
18
|
-
else
|
19
|
-
Polars::Utf8
|
20
|
-
end
|
21
|
-
else
|
22
|
-
Polars.const_get(col["datatype"])
|
23
|
-
end
|
24
|
-
Polars::Series.new(col["name"], col["values"], dtype: dtype)
|
25
|
-
end
|
26
|
-
|
27
|
-
Polars::DataFrame.new(columns)
|
11
|
+
Polars::DataFrame.new(df_data)
|
28
12
|
end
|
29
13
|
end
|
30
14
|
end
|
@@ -10,7 +10,7 @@ module EasyML
|
|
10
10
|
|
11
11
|
def transform(df)
|
12
12
|
df = df.with_column(
|
13
|
-
Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
|
13
|
+
Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
|
14
14
|
)
|
15
15
|
df
|
16
16
|
end
|
@@ -15,8 +15,8 @@ module EasyML
|
|
15
15
|
(column.one_hot? && type.to_sym == :processed)
|
16
16
|
end
|
17
17
|
|
18
|
-
TYPES_ALL = %i(raw
|
19
|
-
TYPES_RAW = %i(raw
|
18
|
+
TYPES_ALL = %i(raw processed)
|
19
|
+
TYPES_RAW = %i(raw)
|
20
20
|
TYPES_PROCESSED = %i(processed)
|
21
21
|
|
22
22
|
def types(type = :all)
|
@@ -21,7 +21,10 @@ module EasyML
|
|
21
21
|
|
22
22
|
def run_queries(split, type)
|
23
23
|
queries = build_queries(split, type)
|
24
|
-
|
24
|
+
|
25
|
+
dataset.columns.apply_clip(
|
26
|
+
@dataset.send(type).send(split, all_columns: true, lazy: true)
|
27
|
+
).select(queries).collect
|
25
28
|
end
|
26
29
|
|
27
30
|
def get_column_statistics(query_results)
|
@@ -180,6 +180,8 @@ module EasyML
|
|
180
180
|
EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
|
181
181
|
update(workflow_status: :ready)
|
182
182
|
unlock!
|
183
|
+
features.update_all(needs_fit: true, workflow_status: "ready")
|
184
|
+
features.each(&:wipe)
|
183
185
|
end
|
184
186
|
|
185
187
|
def refresh_async
|
@@ -201,12 +203,6 @@ module EasyML
|
|
201
203
|
@raw = initialize_split("raw")
|
202
204
|
end
|
203
205
|
|
204
|
-
def clipped
|
205
|
-
return @clipped if @clipped && @clipped.dataset
|
206
|
-
|
207
|
-
@clipped = initialize_split("clipped")
|
208
|
-
end
|
209
|
-
|
210
206
|
def processed
|
211
207
|
return @processed if @processed && @processed.dataset
|
212
208
|
|
@@ -287,6 +283,7 @@ module EasyML
|
|
287
283
|
|
288
284
|
def fit_features(async: false, features: self.features, force: false)
|
289
285
|
features_to_compute = force ? features : features.needs_fit
|
286
|
+
puts "Features to compute.... #{features_to_compute}"
|
290
287
|
return after_fit_features if features_to_compute.empty?
|
291
288
|
|
292
289
|
features.first.fit(features: features_to_compute, async: async)
|
@@ -295,10 +292,12 @@ module EasyML
|
|
295
292
|
measure_method_timing :fit_features
|
296
293
|
|
297
294
|
def after_fit_features
|
295
|
+
puts "After fit features"
|
298
296
|
unlock!
|
299
297
|
reload
|
300
298
|
return if failed?
|
301
299
|
|
300
|
+
puts "Actually refresh..."
|
302
301
|
actually_refresh
|
303
302
|
end
|
304
303
|
|
@@ -385,6 +384,7 @@ module EasyML
|
|
385
384
|
|
386
385
|
def unlock!
|
387
386
|
Support::Lockable.unlock!(lock_key)
|
387
|
+
features.each(&:unlock!)
|
388
388
|
end
|
389
389
|
|
390
390
|
def locked?
|
@@ -490,6 +490,24 @@ module EasyML
|
|
490
490
|
df
|
491
491
|
end
|
492
492
|
|
493
|
+
# Massage out one-hot cats to their canonical name
|
494
|
+
#
|
495
|
+
# Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
|
496
|
+
# Returns: ["Embarked", "PassengerId", "Sex"]
|
497
|
+
def regular_columns(col_list)
|
498
|
+
one_hot_cats = columns.allowed_categories.invert.reduce({}) do |h, (k, v)|
|
499
|
+
h.tap do
|
500
|
+
k.each do |k2|
|
501
|
+
h["#{v}_#{k2}"] = v
|
502
|
+
end
|
503
|
+
end
|
504
|
+
end
|
505
|
+
|
506
|
+
col_list.map do |col|
|
507
|
+
one_hot_cats.key?(col) ? one_hot_cats[col] : col
|
508
|
+
end.uniq.sort
|
509
|
+
end
|
510
|
+
|
493
511
|
measure_method_timing :normalize
|
494
512
|
|
495
513
|
def missing_required_fields(df)
|
@@ -537,7 +555,6 @@ module EasyML
|
|
537
555
|
|
538
556
|
def cleanup
|
539
557
|
raw.cleanup
|
540
|
-
clipped.cleanup
|
541
558
|
processed.cleanup
|
542
559
|
end
|
543
560
|
|
@@ -730,10 +747,8 @@ module EasyML
|
|
730
747
|
|
731
748
|
def initialize_splits
|
732
749
|
@raw = nil
|
733
|
-
@clipped = nil
|
734
750
|
@processed = nil
|
735
751
|
raw
|
736
|
-
clipped
|
737
752
|
processed
|
738
753
|
end
|
739
754
|
|
@@ -778,7 +793,7 @@ module EasyML
|
|
778
793
|
processed.cleanup
|
779
794
|
|
780
795
|
SPLIT_ORDER.each do |segment|
|
781
|
-
df =
|
796
|
+
df = raw.read(segment)
|
782
797
|
learn_computed_columns(df) if segment == :train
|
783
798
|
processed_df = normalize(df, all_columns: true)
|
784
799
|
processed.save(segment, processed_df)
|
@@ -825,26 +840,9 @@ module EasyML
|
|
825
840
|
end
|
826
841
|
|
827
842
|
def fit
|
828
|
-
apply_clip
|
829
843
|
learn_statistics(type: :raw)
|
830
844
|
end
|
831
845
|
|
832
|
-
def apply_clip
|
833
|
-
clipped.cleanup
|
834
|
-
|
835
|
-
SPLIT_ORDER.each do |segment|
|
836
|
-
df = raw.send(segment, lazy: true, all_columns: true)
|
837
|
-
clipped.save(
|
838
|
-
segment,
|
839
|
-
columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
|
840
|
-
)
|
841
|
-
end
|
842
|
-
end
|
843
|
-
|
844
|
-
measure_method_timing :apply_clip
|
845
|
-
|
846
|
-
# log_method :fit, "Learning statistics", verbose: true
|
847
|
-
|
848
846
|
def split_data!
|
849
847
|
split_data(force: true)
|
850
848
|
end
|
@@ -22,7 +22,6 @@ module EasyML
|
|
22
22
|
DATASOURCE_OPTIONS = {
|
23
23
|
"s3" => "EasyML::Datasources::S3Datasource",
|
24
24
|
"file" => "EasyML::Datasources::FileDatasource",
|
25
|
-
"polars" => "EasyML::Datasources::PolarsDatasource",
|
26
25
|
}
|
27
26
|
DATASOURCE_TYPES = [
|
28
27
|
{
|
@@ -35,11 +34,6 @@ module EasyML
|
|
35
34
|
label: "Local Files",
|
36
35
|
description: "Connect to data stored in local files",
|
37
36
|
},
|
38
|
-
{
|
39
|
-
value: "polars",
|
40
|
-
label: "Polars DataFrame",
|
41
|
-
description: "In-memory dataframe storage using Polars",
|
42
|
-
},
|
43
37
|
].freeze
|
44
38
|
DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
|
45
39
|
DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize)
|
@@ -88,6 +88,7 @@ module EasyML
|
|
88
88
|
before_save :update_sha
|
89
89
|
after_find :update_from_feature_class
|
90
90
|
before_save :update_from_feature_class
|
91
|
+
before_destroy :wipe
|
91
92
|
|
92
93
|
def feature_klass
|
93
94
|
feature_class.constantize
|
@@ -197,7 +198,7 @@ module EasyML
|
|
197
198
|
end
|
198
199
|
|
199
200
|
EasyML::Data::Partition::Boundaries.new(
|
200
|
-
reader.data(lazy: true),
|
201
|
+
reader.data(lazy: true, all_columns: true),
|
201
202
|
primary_key,
|
202
203
|
batch_size
|
203
204
|
).to_a.map.with_index do |partition, idx|
|
@@ -207,7 +208,6 @@ module EasyML
|
|
207
208
|
batch_end: partition[:partition_end],
|
208
209
|
batch_number: feature_position,
|
209
210
|
subbatch_number: idx,
|
210
|
-
parent_batch_id: Random.uuid,
|
211
211
|
}
|
212
212
|
end
|
213
213
|
end
|
@@ -218,7 +218,12 @@ module EasyML
|
|
218
218
|
|
219
219
|
def fit(features: [self], async: false)
|
220
220
|
ordered_features = features.sort_by(&:feature_position)
|
221
|
-
|
221
|
+
parent_batch_id = Random.uuid
|
222
|
+
jobs = ordered_features.map do |feature|
|
223
|
+
feature.build_batches.map do |batch_args|
|
224
|
+
batch_args.merge(parent_batch_id: parent_batch_id)
|
225
|
+
end
|
226
|
+
end
|
222
227
|
job_count = jobs.dup.flatten.size
|
223
228
|
|
224
229
|
ordered_features.each(&:wipe)
|
@@ -454,6 +459,10 @@ module EasyML
|
|
454
459
|
update!(updates)
|
455
460
|
end
|
456
461
|
|
462
|
+
def unlock!
|
463
|
+
feature_store.unlock!
|
464
|
+
end
|
465
|
+
|
457
466
|
UNCONFIGURABLE_COLUMNS = %w(
|
458
467
|
id
|
459
468
|
dataset_id
|