easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/models_controller.rb +3 -2
- data/app/frontend/components/ModelForm.tsx +16 -0
- data/app/frontend/components/ScheduleModal.tsx +0 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
- data/app/jobs/easy_ml/application_job.rb +1 -0
- data/app/jobs/easy_ml/batch_job.rb +47 -6
- data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
- data/app/jobs/easy_ml/reaper.rb +14 -10
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
- data/app/models/easy_ml/column/imputers/today.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +0 -8
- data/app/models/easy_ml/column.rb +1 -1
- data/app/models/easy_ml/dataset/learner/base.rb +2 -2
- data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
- data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
- data/app/models/easy_ml/dataset.rb +29 -76
- data/app/models/easy_ml/datasource.rb +0 -6
- data/app/models/easy_ml/feature.rb +27 -38
- data/app/models/easy_ml/model.rb +20 -2
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
- data/app/models/easy_ml/models/xgboost.rb +52 -36
- data/app/models/easy_ml/retraining_run.rb +1 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
- data/app/serializers/easy_ml/model_serializer.rb +1 -0
- data/lib/easy_ml/core/tuner.rb +7 -4
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
- data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
- data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
- data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
- data/lib/easy_ml/data/dataset_manager.rb +140 -0
- data/lib/easy_ml/data/partition/boundaries.rb +60 -0
- data/lib/easy_ml/data/partition.rb +7 -0
- data/lib/easy_ml/data/polars_column.rb +19 -5
- data/lib/easy_ml/data/synced_directory.rb +1 -2
- data/lib/easy_ml/data.rb +2 -0
- data/lib/easy_ml/engine.rb +16 -14
- data/lib/easy_ml/feature_store.rb +21 -188
- data/lib/easy_ml/reasons.rb +41 -0
- data/lib/easy_ml/support/lockable.rb +1 -5
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
- metadata +24 -9
- data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
- data/lib/easy_ml/data/filter_extensions.rb +0 -31
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
- /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
+  data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
+  data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
data/app/controllers/easy_ml/models_controller.rb
CHANGED
@@ -30,7 +30,7 @@ module EasyML
     def new
       render inertia: "pages/NewModelPage", props: {
         datasets: EasyML::Dataset.all.map do |dataset|
-          dataset
+          dataset_to_json(dataset)
         end,
         constants: EasyML::Model.constants,
       }
@@ -41,7 +41,7 @@ module EasyML
       render inertia: "pages/EditModelPage", props: {
         model: model_to_json(model),
         datasets: EasyML::Dataset.all.map do |dataset|
-          dataset
+          dataset_to_json(dataset)
         end,
         constants: EasyML::Model.constants,
       }
@@ -177,6 +177,7 @@ module EasyML
         :dataset_id,
         :task,
         :objective,
+        :weights_column,
         metrics: [],
         retraining_job_attributes: [
           :id,
data/app/frontend/components/ModelForm.tsx
CHANGED
@@ -16,6 +16,7 @@ interface ModelFormProps {
   task: string;
   objective?: string;
   metrics?: string[];
+  weights_column?: string;
   retraining_job?: {
     frequency: string;
     at: {
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
     task: initialData?.task || 'classification',
     objective: initialData?.objective || 'binary:logistic',
     metrics: initialData?.metrics || ['accuracy_score'],
+    weights_column: initialData?.weights_column || '',
     retraining_job_attributes: initialData?.retraining_job ? {
       id: initialData.retraining_job.id,
       frequency: initialData.retraining_job.frequency,
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
   };
 
   const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
+  const columns = selectedDataset?.columns || [];
 
   const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
 
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
         <ErrorDisplay error={errors.dataset_id} />
       </div>
 
+      <div>
+        <label className="block text-sm font-medium text-gray-700 mb-1">
+          Weights Column (Optional)
+        </label>
+        <SearchableSelect
+          value={data.model.weights_column}
+          options={columns.map(col => ({ value: col.name, label: col.name }))}
+          onChange={(value) => setData('model.weights_column', value)}
+          isClearable={true}
+        />
+        <ErrorDisplay error={errors.weights_column} />
+      </div>
+
       <div>
         <label className="block text-sm font-medium text-gray-700 mb-1">
           Task
data/app/frontend/components/ScheduleModal.tsx
CHANGED
@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
             value={formData.retraining_job_attributes.threshold}
             onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
             step={0.01}
-            min={0}
-            max={1}
             className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
           />
         </div>
data/app/frontend/components/dataset/PreprocessingConfig.tsx
CHANGED
@@ -250,16 +250,17 @@ export function PreprocessingConfig({
     setIsEditingDescription(true);
   };
 
-  let nullCount = (column.statistics?.processed
-
-
+  let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
+  let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
+  const nullPercentage = nullCount && numRows
+    ? ((nullCount / numRows) * 100)
     : 0;
 
-  const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.
-    ? ((column.statistics.processed.null_count / column.statistics.
+  const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
+    ? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
     : 0;
 
-  const totalRows =
+  const totalRows = numRows;
 
   const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
     const strategy = type === 'training' ? training : inference;
data/app/jobs/easy_ml/batch_job.rb
CHANGED
@@ -39,15 +39,15 @@ module EasyML
        rest.map do |batch|
          Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
        end
-
+        track_batch(parent_id)
        handle_batch(parent_id, batch)
      end
 
      def handle_batch(parent_id, batch)
        if batch.size > 1
-          enqueue_batch(batch)
+          enqueue_batch(batch, parent_id)
        else
-
+          new.perform(parent_id, batch.first)
          after_batch_hook(parent_id, batch)
        end
      end
@@ -60,7 +60,21 @@ module EasyML
      end
 
      def next_batch?(parent_id)
-        batches_remaining(parent_id) > 0
+        (batches_remaining(parent_id) > 0)
+      end
+
+      def list_batches
+        Resque.redis.hkeys("batches:tracking")
+      end
+
+      def track_batch(parent_id)
+        Resque.redis.hset("batches:tracking", parent_id, 1)
+      end
+
+      def cleanup_all
+        list_batches.each do |batch_id|
+          cleanup_batch(batch_id)
+        end
      end
 
      def batches_remaining(parent_id)
@@ -69,12 +83,39 @@ module EasyML
 
      def cleanup_batch(parent_id)
        Resque.redis.del("batch:#{parent_id}:remaining")
+        Resque.redis.hdel("batches:tracking", parent_id)
      end
 
-
+      def batch_args
+        list_batches.map do |batch_id|
+          fetch_batch_arguments(batch_id)
+        end
+      end
+
+      def select_batches(&block)
+        list_batches.select do |batch_id|
+          yield fetch_batch_arguments(batch_id)
+        end
+      end
+
+      def poll
+        while true
+          sleep 2
+          EasyML::BatchJob.list_batches.map do |batch|
+            puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
+          end
+        end
+      end
 
      def get_parent_batch_id(args_list)
-        args_list.dup.flatten.
+        args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
+      end
+
+      private
+
+      def get_args_list(batch_id)
+        redis_key = "#{batch(batch_id)}:original_args"
+        redis.get(redis_key)
      end
 
      # Store batch arguments in Redis
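The batch_job.rb changes above add explicit batch bookkeeping in Redis: each parent batch id is registered in a "batches:tracking" hash alongside its "batch:<parent_id>:remaining" work list, and cleanup_batch/cleanup_all remove both. A minimal console sketch of that lifecycle, assuming a configured Resque connection; the parent id and payload below are made up for illustration:

  parent_id = "example-parent-id"
  Resque.redis.rpush("batch:#{parent_id}:remaining", [{ feature_id: 1 }].to_json) # what enqueue_batch does per batch
  Resque.redis.hset("batches:tracking", parent_id, 1)                             # what track_batch does

  EasyML::BatchJob.list_batches           # tracked parent ids, e.g. ["example-parent-id"]
  EasyML::BatchJob.next_batch?(parent_id) # true while the remaining list is non-empty
  EasyML::BatchJob.cleanup_all            # drops every remaining list and its tracking entry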
data/app/jobs/easy_ml/compute_feature_job.rb
CHANGED
@@ -14,31 +14,31 @@ module EasyML
      #
      # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
      batch_args = batch_args.dup
-
+      EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
    end
 
-    def
+    def perform(batch_id, batch_args = {})
      EasyML::Feature.fit_one_batch(batch_id, batch_args)
    end
 
    def self.after_batch_hook(batch_id, *args)
-
-
-      parent_id = batch_args.pluck(:parent_batch_id).first
+      args = args.flatten.first.with_indifferent_access
+      feature_id = args.dig(:feature_id)
 
-      feature = EasyML::Feature.find_by(id:
+      feature = EasyML::Feature.find_by(id: feature_id)
 
      if feature.failed?
        dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
-        return BatchJob.cleanup_batch(
+        return BatchJob.cleanup_batch(batch_id)
      end
 
      feature.after_fit
 
-      if BatchJob.next_batch?(
-        BatchJob.enqueue_next_batch(self,
+      if BatchJob.next_batch?(batch_id)
+        BatchJob.enqueue_next_batch(self, batch_id)
      else
-
+        cleanup_batch(batch_id)
+        dataset = feature.dataset
        dataset.after_fit_features
      end
    end
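For reference, the reworked after_batch_hook above now flattens its splat arguments and reads the feature id with indifferent access instead of plucking parent_batch_id. A small standalone illustration of that normalization, using a made-up payload (requires ActiveSupport):

  require "active_support/core_ext/hash/indifferent_access"

  raw_args = [[{ "feature_id" => 42, "parent_batch_id" => "abc" }]] # hypothetical job arguments
  args = raw_args.flatten.first.with_indifferent_access
  args.dig(:feature_id) # => 42, regardless of string or symbol keys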
data/app/jobs/easy_ml/reaper.rb
CHANGED
@@ -9,8 +9,8 @@ module EasyML
          {
            worker: worker,
            working: true,
-            class: args.dig("job_class"),
-            args: args.dig("arguments"),
+            class: args.is_a?(Hash) ? args.dig("job_class") : nil,
+            args: args.is_a?(Hash) ? args.dig("arguments") : nil,
            pid: worker.pid,
          }
        else
@@ -19,17 +19,23 @@ module EasyML
        end
      end
 
-      def find_job(worker_class, *args)
+      def find_job(worker_class, *args, &block)
        list_workers.select do |config|
-          config.dig(:class) == worker_class.to_s
+          selected = config.dig(:class) == worker_class.to_s
+          if block_given?
+            selected &&= yield(config)
+          else
+            selected &= config.dig(:args) == args
+          end
+          selected
        end
      end
 
-      def kill(worker_class, *args)
-        find_job(worker_class, *args).each do |job|
+      def kill(worker_class, *args, &block)
+        find_job(worker_class, *args, &block).each do |job|
          begin
-            # Send
-            Process.kill("
+            # Send HUP signal to the process
+            Process.kill("USR1", job[:pid])
 
            # Remove the worker from Redis so it doesn't show up as a zombie
            # in the Resque web interface. This is important because:
@@ -37,12 +43,10 @@ module EasyML
            # 2. Prevents confusion about running workers
            # 3. Allows proper worker cleanup in Redis
            job[:worker].done_working
-            job[:worker].unregister_worker
          rescue Errno::ESRCH
            # Process already gone, but still try to clean up Redis
            begin
              job[:worker].done_working
-              job[:worker].unregister_worker
            rescue => e
              # Redis cleanup failed, worker might already be unregistered
              puts "Failed to unregister worker: #{e.message}"
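The reaper change above lets find_job and kill accept a block that filters the worker configs (:class, :args, :pid, :worker) instead of requiring an exact argument match. A hedged usage sketch; the id-based filter is hypothetical, only the block-passing shape comes from this diff:

  # Kill only RefreshDatasetJob workers whose arguments mention a given dataset id.
  EasyML::Reaper.kill(EasyML::RefreshDatasetJob) do |config|
    Array(config[:args]).flatten.any? { |arg| arg.to_s.include?(dataset_id.to_s) }
  end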
data/app/models/concerns/easy_ml/dataframe_serialization.rb
CHANGED
@@ -8,23 +8,7 @@ module EasyML
    end
 
    def deserialize_dataframe(df_data)
-
-
-      columns = df_data["columns"].map do |col|
-        dtype = case col["datatype"]
-          when Hash
-            if col["datatype"]["Datetime"]
-              Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
-            else
-              Polars::Utf8
-            end
-          else
-            Polars.const_get(col["datatype"])
-          end
-        Polars::Series.new(col["name"], col["values"], dtype: dtype)
-      end
-
-      Polars::DataFrame.new(columns)
+      Polars::DataFrame.new(df_data)
    end
  end
end
data/app/models/easy_ml/column/imputers/today.rb
CHANGED
@@ -10,7 +10,7 @@ module EasyML
 
        def transform(df)
          df = df.with_column(
-            Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
+            Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
          )
          df
        end
@@ -15,8 +15,8 @@ module EasyML
        (column.one_hot? && type.to_sym == :processed)
      end
 
-      TYPES_ALL = %i(raw
-      TYPES_RAW = %i(raw
+      TYPES_ALL = %i(raw processed)
+      TYPES_RAW = %i(raw)
      TYPES_PROCESSED = %i(processed)
 
      def types(type = :all)
@@ -21,7 +21,10 @@ module EasyML
 
      def run_queries(split, type)
        queries = build_queries(split, type)
-
+
+        dataset.columns.apply_clip(
+          @dataset.send(type).send(split, all_columns: true, lazy: true)
+        ).select(queries).collect
      end
 
      def get_column_statistics(query_results)
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class RefreshReasons < EasyML::Reasons
|
4
|
+
add_reason "Not split", -> { not_split? }
|
5
|
+
add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
|
6
|
+
add_reason "Columns need refresh", -> { columns_need_refresh? }
|
7
|
+
add_reason "Features need refresh", -> { features_need_fit? }
|
8
|
+
add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
|
9
|
+
add_reason "Datasource was refreshed", -> { datasource_was_refreshed? }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
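This new file replaces the inline hash of refresh checks deleted from dataset.rb below. Each add_reason pairs a label with a predicate evaluated against the wrapped record, and check(except:) returns the labels that currently apply; the DSL itself lives in the added lib/easy_ml/reasons.rb, which is not shown here, so treat the details as assumptions. A sketch with a hypothetical subclass:

  class StaleReasons < EasyML::Reasons # hypothetical example, not part of the gem
    add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
    add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
  end

  StaleReasons.new(dataset).check(except: [])      # => e.g. ["Refreshed at is nil"]
  StaleReasons.new(dataset).check(except: []).any? # the shape Dataset#needs_refresh? relies on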
data/app/models/easy_ml/dataset.rb
CHANGED
@@ -180,6 +180,8 @@ module EasyML
      EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
      update(workflow_status: :ready)
      unlock!
+      features.update_all(needs_fit: true, workflow_status: "ready")
+      features.each(&:wipe)
    end
 
    def refresh_async
@@ -201,12 +203,6 @@ module EasyML
      @raw = initialize_split("raw")
    end
 
-    def clipped
-      return @clipped if @clipped && @clipped.dataset
-
-      @clipped = initialize_split("clipped")
-    end
-
    def processed
      return @processed if @processed && @processed.dataset
 
@@ -265,9 +261,7 @@ module EasyML
 
    def refresh!(async: false)
      refreshing do
-        puts "Prepare..."
        prepare!
-        puts "Fit features..."
        fit_features!(async: async)
      end
    end
@@ -276,9 +270,7 @@ module EasyML
      return refresh_async if async
 
      refreshing do
-        puts "prepare.."
        prepare
-        puts "fit features..."
        fit_features(async: async)
      end
    end
@@ -291,6 +283,7 @@ module EasyML
 
    def fit_features(async: false, features: self.features, force: false)
      features_to_compute = force ? features : features.needs_fit
+      puts "Features to compute.... #{features_to_compute}"
      return after_fit_features if features_to_compute.empty?
 
      features.first.fit(features: features_to_compute, async: async)
@@ -299,11 +292,12 @@ module EasyML
    measure_method_timing :fit_features
 
    def after_fit_features
-      puts "
+      puts "After fit features"
      unlock!
      reload
      return if failed?
 
+      puts "Actually refresh..."
      actually_refresh
    end
 
@@ -338,45 +332,12 @@ module EasyML
    #
    # So yes this is an annoying way to structure a method, but it's helpful for performance
    #
-    def refresh_reasons(
-
-      not_split: {
-        name: "Not split",
-        check: -> { not_split? },
-      },
-      refreshed_at_is_nil: {
-        name: "Refreshed at is nil",
-        check: -> { refreshed_at.nil? },
-      },
-      columns_need_refresh: {
-        name: "Columns need refresh",
-        check: -> { columns_need_refresh? },
-      },
-      features_need_fit: {
-        name: "Features need refresh",
-        check: -> { features_need_fit? },
-      },
-      datasource_needs_refresh: {
-        name: "Datasource needs refresh",
-        check: -> { datasource_needs_refresh? },
-      },
-      refreshed_datasource: {
-        name: "Refreshed datasource",
-        check: -> { refreshed_datasource? },
-      },
-      datasource_was_refreshed: {
-        name: "Datasource was refreshed",
-        check: -> { datasource_was_refreshed? },
-      },
-    }.except(*exclude).select do |k, config|
-      config[:check].call
-    end.map do |k, config|
-      config[:name]
-    end
+    def refresh_reasons(except: [])
+      RefreshReasons.new(self).check(except: except)
    end
 
-    def needs_refresh?(
-      refresh_reasons(
+    def needs_refresh?(except: [])
+      refresh_reasons(except: except).any?
    end
 
    def processed?
@@ -423,6 +384,7 @@ module EasyML
 
    def unlock!
      Support::Lockable.unlock!(lock_key)
+      features.each(&:unlock!)
    end
 
    def locked?
@@ -518,23 +480,34 @@ module EasyML
    end
 
    def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
-      puts "Apply missing features..."
      df = apply_missing_columns(df, inference: inference)
-      puts "Transform columns..."
      df = columns.transform(df, inference: inference)
-      puts "Apply features..."
      df = apply_features(df, features)
-      puts "Transform columns..."
      df = columns.transform(df, inference: inference, computed: true)
-      puts "Apply column mask..."
      df = apply_column_mask(df, inference: inference) unless all_columns
-      puts "Drop nulls..."
      df = drop_nulls(df) unless inference
-      puts "Split features and targets..."
      df, = processed.split_features_targets(df, true, target) if split_ys
      df
    end
 
+    # Massage out one-hot cats to their canonical name
+    #
+    # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
+    # Returns: ["Embarked", "Sex", "PassengerId"]
+    def regular_columns(col_list)
+      one_hot_cats = columns.allowed_categories.invert.reduce({}) do |h, (k, v)|
+        h.tap do
+          k.each do |k2|
+            h["#{v}_#{k2}"] = v
+          end
+        end
+      end
+
+      col_list.map do |col|
+        one_hot_cats.key?(col) ? one_hot_cats[col] : col
+      end.uniq.sort
+    end
+
    measure_method_timing :normalize
 
    def missing_required_fields(df)
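The new regular_columns helper above collapses one-hot encoded column names back to their source columns by inverting columns.allowed_categories. A self-contained illustration of that mapping; the allowed_categories shape is assumed from how the method reads it:

  allowed_categories = { "Sex" => ["male", "female"], "Embarked" => ["c"] }

  one_hot_cats = allowed_categories.invert.reduce({}) do |h, (cats, col)|
    cats.each { |cat| h["#{col}_#{cat}"] = col }
    h
  end
  # => { "Sex_male" => "Sex", "Sex_female" => "Sex", "Embarked_c" => "Embarked" }

  ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
    .map { |col| one_hot_cats.fetch(col, col) }.uniq.sort
  # => ["Embarked", "PassengerId", "Sex"]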
@@ -582,7 +555,6 @@ module EasyML
 
    def cleanup
      raw.cleanup
-      clipped.cleanup
      processed.cleanup
    end
 
@@ -775,10 +747,8 @@ module EasyML
 
    def initialize_splits
      @raw = nil
-      @clipped = nil
      @processed = nil
      raw
-      clipped
      processed
    end
 
@@ -823,7 +793,7 @@ module EasyML
      processed.cleanup
 
      SPLIT_ORDER.each do |segment|
-        df =
+        df = raw.read(segment)
        learn_computed_columns(df) if segment == :train
        processed_df = normalize(df, all_columns: true)
        processed.save(segment, processed_df)
@@ -870,26 +840,9 @@ module EasyML
    end
 
    def fit
-      apply_clip
      learn_statistics(type: :raw)
    end
 
-    def apply_clip
-      clipped.cleanup
-
-      SPLIT_ORDER.each do |segment|
-        df = raw.send(segment, lazy: true, all_columns: true)
-        clipped.save(
-          segment,
-          columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
-        )
-      end
-    end
-
-    measure_method_timing :apply_clip
-
-    # log_method :fit, "Learning statistics", verbose: true
-
    def split_data!
      split_data(force: true)
    end