easy_ml 0.2.0.pre.rc77 → 0.2.0.pre.rc81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +3 -3
- data/app/controllers/easy_ml/models_controller.rb +4 -3
- data/app/frontend/components/ModelForm.tsx +16 -0
- data/app/frontend/components/ScheduleModal.tsx +0 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
- data/app/jobs/easy_ml/application_job.rb +1 -0
- data/app/jobs/easy_ml/batch_job.rb +47 -6
- data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
- data/app/jobs/easy_ml/reaper.rb +14 -10
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/imputer.rb +2 -0
- data/app/models/easy_ml/column/imputers/today.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +0 -8
- data/app/models/easy_ml/column.rb +1 -1
- data/app/models/easy_ml/column_list.rb +2 -3
- data/app/models/easy_ml/dataset/learner/base.rb +2 -2
- data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
- data/app/models/easy_ml/dataset.rb +47 -38
- data/app/models/easy_ml/datasource.rb +0 -6
- data/app/models/easy_ml/feature.rb +33 -8
- data/app/models/easy_ml/model.rb +27 -4
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +21 -5
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +9 -5
- data/app/models/easy_ml/models/xgboost.rb +58 -36
- data/app/models/easy_ml/retraining_run.rb +1 -1
- data/app/serializers/easy_ml/model_serializer.rb +1 -0
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +16 -3
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +0 -17
- data/lib/easy_ml/core/tuner.rb +14 -5
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +12 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +5 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +7 -1
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +26 -9
- data/lib/easy_ml/data/dataset_manager/writer.rb +5 -1
- data/lib/easy_ml/data/dataset_manager.rb +18 -4
- data/lib/easy_ml/data/embeddings/adapters.rb +56 -0
- data/lib/easy_ml/data/embeddings/compression.rb +0 -0
- data/lib/easy_ml/data/embeddings.rb +43 -0
- data/lib/easy_ml/data/polars_column.rb +19 -5
- data/lib/easy_ml/engine.rb +16 -14
- data/lib/easy_ml/feature_store.rb +19 -16
- data/lib/easy_ml/support/lockable.rb +1 -5
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
- metadata +9 -7
- data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 169873e9ea5e1b00f7a4e499a2aeffc377615757cdd4b5fe6d70c8c454b9d426
|
4
|
+
data.tar.gz: fc1d4509606f011bd3adbdf367e767a3e9dfc4fdbb6b5cd91bb413f72da364b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4ea106a66d3185f612e481b607cef21f5453a00ef6eb12558e53f43aa1d68b8f20d1e950c1840d97180ba6271db9b9b42c8a2d480c22ca8c1e33d1b768590d2
|
7
|
+
data.tar.gz: 24885cdecd46d612be8b8ce7d4bd9889bdf60420bc82c01993cfe0168e454ebdaaa70899f5cf9cc879ff89a895fd00b1761d01c945dafffc784f7bc1e4a2fb8e
|
@@ -23,7 +23,7 @@
|
|
23
23
|
module EasyML
|
24
24
|
class DatasetsController < ApplicationController
|
25
25
|
def index
|
26
|
-
datasets = Dataset.all.order(id: :desc)
|
26
|
+
datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)
|
27
27
|
|
28
28
|
render inertia: "pages/DatasetsPage", props: {
|
29
29
|
datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
|
@@ -80,7 +80,7 @@ module EasyML
|
|
80
80
|
if dataset_params[:features_attributes].present?
|
81
81
|
# Clean up any feature IDs that don't exist anymore
|
82
82
|
feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
|
83
|
-
existing_feature_ids =
|
83
|
+
existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)
|
84
84
|
|
85
85
|
params[:dataset][:features_attributes].each do |attrs|
|
86
86
|
if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
|
@@ -93,7 +93,7 @@ module EasyML
|
|
93
93
|
attrs[:feature_class] if attrs[:id].blank?
|
94
94
|
}.compact
|
95
95
|
|
96
|
-
existing_features =
|
96
|
+
existing_features = dataset.features.where(feature_class: feature_classes)
|
97
97
|
|
98
98
|
# Update params with existing feature IDs
|
99
99
|
existing_features.each do |feature|
|
@@ -30,7 +30,7 @@ module EasyML
|
|
30
30
|
def new
|
31
31
|
render inertia: "pages/NewModelPage", props: {
|
32
32
|
datasets: EasyML::Dataset.all.map do |dataset|
|
33
|
-
dataset
|
33
|
+
dataset_to_json(dataset)
|
34
34
|
end,
|
35
35
|
constants: EasyML::Model.constants,
|
36
36
|
}
|
@@ -41,7 +41,7 @@ module EasyML
|
|
41
41
|
render inertia: "pages/EditModelPage", props: {
|
42
42
|
model: model_to_json(model),
|
43
43
|
datasets: EasyML::Dataset.all.map do |dataset|
|
44
|
-
dataset
|
44
|
+
dataset_to_json_small(dataset)
|
45
45
|
end,
|
46
46
|
constants: EasyML::Model.constants,
|
47
47
|
}
|
@@ -167,7 +167,7 @@ module EasyML
|
|
167
167
|
private
|
168
168
|
|
169
169
|
def includes_list
|
170
|
-
[:retraining_runs, :retraining_job, dataset: [:
|
170
|
+
[:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
|
171
171
|
end
|
172
172
|
|
173
173
|
def model_params
|
@@ -177,6 +177,7 @@ module EasyML
|
|
177
177
|
:dataset_id,
|
178
178
|
:task,
|
179
179
|
:objective,
|
180
|
+
:weights_column,
|
180
181
|
metrics: [],
|
181
182
|
retraining_job_attributes: [
|
182
183
|
:id,
|
@@ -16,6 +16,7 @@ interface ModelFormProps {
|
|
16
16
|
task: string;
|
17
17
|
objective?: string;
|
18
18
|
metrics?: string[];
|
19
|
+
weights_column?: string;
|
19
20
|
retraining_job?: {
|
20
21
|
frequency: string;
|
21
22
|
at: {
|
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
75
76
|
task: initialData?.task || 'classification',
|
76
77
|
objective: initialData?.objective || 'binary:logistic',
|
77
78
|
metrics: initialData?.metrics || ['accuracy_score'],
|
79
|
+
weights_column: initialData?.weights_column || '',
|
78
80
|
retraining_job_attributes: initialData?.retraining_job ? {
|
79
81
|
id: initialData.retraining_job.id,
|
80
82
|
frequency: initialData.retraining_job.frequency,
|
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
165
167
|
};
|
166
168
|
|
167
169
|
const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
|
170
|
+
const columns = selectedDataset?.columns || [];
|
168
171
|
|
169
172
|
const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
|
170
173
|
|
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
246
249
|
<ErrorDisplay error={errors.dataset_id} />
|
247
250
|
</div>
|
248
251
|
|
252
|
+
<div>
|
253
|
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
254
|
+
Weights Column (Optional)
|
255
|
+
</label>
|
256
|
+
<SearchableSelect
|
257
|
+
value={data.model.weights_column}
|
258
|
+
options={columns.map(col => ({ value: col.name, label: col.name }))}
|
259
|
+
onChange={(value) => setData('model.weights_column', value)}
|
260
|
+
isClearable={true}
|
261
|
+
/>
|
262
|
+
<ErrorDisplay error={errors.weights_column} />
|
263
|
+
</div>
|
264
|
+
|
249
265
|
<div>
|
250
266
|
<label className="block text-sm font-medium text-gray-700 mb-1">
|
251
267
|
Task
|
@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
|
|
587
587
|
value={formData.retraining_job_attributes.threshold}
|
588
588
|
onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
|
589
589
|
step={0.01}
|
590
|
-
min={0}
|
591
|
-
max={1}
|
592
590
|
className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
|
593
591
|
/>
|
594
592
|
</div>
|
@@ -250,16 +250,17 @@ export function PreprocessingConfig({
|
|
250
250
|
setIsEditingDescription(true);
|
251
251
|
};
|
252
252
|
|
253
|
-
let nullCount = (column.statistics?.processed
|
254
|
-
|
255
|
-
|
253
|
+
let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
|
254
|
+
let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
|
255
|
+
const nullPercentage = nullCount && numRows
|
256
|
+
? ((nullCount / numRows) * 100)
|
256
257
|
: 0;
|
257
258
|
|
258
|
-
const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.
|
259
|
-
? ((column.statistics.processed.null_count / column.statistics.
|
259
|
+
const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
|
260
|
+
? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
|
260
261
|
: 0;
|
261
262
|
|
262
|
-
const totalRows =
|
263
|
+
const totalRows = numRows;
|
263
264
|
|
264
265
|
const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
|
265
266
|
const strategy = type === 'training' ? training : inference;
|
@@ -39,15 +39,15 @@ module EasyML
|
|
39
39
|
rest.map do |batch|
|
40
40
|
Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
|
41
41
|
end
|
42
|
-
|
42
|
+
track_batch(parent_id)
|
43
43
|
handle_batch(parent_id, batch)
|
44
44
|
end
|
45
45
|
|
46
46
|
def handle_batch(parent_id, batch)
|
47
47
|
if batch.size > 1
|
48
|
-
enqueue_batch(batch)
|
48
|
+
enqueue_batch(batch, parent_id)
|
49
49
|
else
|
50
|
-
|
50
|
+
new.perform(parent_id, batch.first)
|
51
51
|
after_batch_hook(parent_id, batch)
|
52
52
|
end
|
53
53
|
end
|
@@ -60,7 +60,21 @@ module EasyML
|
|
60
60
|
end
|
61
61
|
|
62
62
|
def next_batch?(parent_id)
|
63
|
-
batches_remaining(parent_id) > 0
|
63
|
+
(batches_remaining(parent_id) > 0)
|
64
|
+
end
|
65
|
+
|
66
|
+
def list_batches
|
67
|
+
Resque.redis.hkeys("batches:tracking")
|
68
|
+
end
|
69
|
+
|
70
|
+
def track_batch(parent_id)
|
71
|
+
Resque.redis.hset("batches:tracking", parent_id, 1)
|
72
|
+
end
|
73
|
+
|
74
|
+
def cleanup_all
|
75
|
+
list_batches.each do |batch_id|
|
76
|
+
cleanup_batch(batch_id)
|
77
|
+
end
|
64
78
|
end
|
65
79
|
|
66
80
|
def batches_remaining(parent_id)
|
@@ -69,12 +83,39 @@ module EasyML
|
|
69
83
|
|
70
84
|
def cleanup_batch(parent_id)
|
71
85
|
Resque.redis.del("batch:#{parent_id}:remaining")
|
86
|
+
Resque.redis.hdel("batches:tracking", parent_id)
|
72
87
|
end
|
73
88
|
|
74
|
-
|
89
|
+
def batch_args
|
90
|
+
list_batches.map do |batch_id|
|
91
|
+
fetch_batch_arguments(batch_id)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def select_batches(&block)
|
96
|
+
list_batches.select do |batch_id|
|
97
|
+
yield fetch_batch_arguments(batch_id)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def poll
|
102
|
+
while true
|
103
|
+
sleep 2
|
104
|
+
EasyML::BatchJob.list_batches.map do |batch|
|
105
|
+
puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
75
109
|
|
76
110
|
def get_parent_batch_id(args_list)
|
77
|
-
args_list.dup.flatten.
|
111
|
+
args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
|
112
|
+
end
|
113
|
+
|
114
|
+
private
|
115
|
+
|
116
|
+
def get_args_list(batch_id)
|
117
|
+
redis_key = "#{batch(batch_id)}:original_args"
|
118
|
+
redis.get(redis_key)
|
78
119
|
end
|
79
120
|
|
80
121
|
# Store batch arguments in Redis
|
@@ -14,31 +14,31 @@ module EasyML
|
|
14
14
|
#
|
15
15
|
# https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
|
16
16
|
batch_args = batch_args.dup
|
17
|
-
|
17
|
+
EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
|
18
18
|
end
|
19
19
|
|
20
|
-
def
|
20
|
+
def perform(batch_id, batch_args = {})
|
21
21
|
EasyML::Feature.fit_one_batch(batch_id, batch_args)
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.after_batch_hook(batch_id, *args)
|
25
|
-
|
26
|
-
|
27
|
-
parent_id = batch_args.pluck(:parent_batch_id).first
|
25
|
+
args = args.flatten.first.with_indifferent_access
|
26
|
+
feature_id = args.dig(:feature_id)
|
28
27
|
|
29
|
-
feature = EasyML::Feature.find_by(id:
|
28
|
+
feature = EasyML::Feature.find_by(id: feature_id)
|
30
29
|
|
31
30
|
if feature.failed?
|
32
31
|
dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
|
33
|
-
return BatchJob.cleanup_batch(
|
32
|
+
return BatchJob.cleanup_batch(batch_id)
|
34
33
|
end
|
35
34
|
|
36
35
|
feature.after_fit
|
37
36
|
|
38
|
-
if BatchJob.next_batch?(
|
39
|
-
BatchJob.enqueue_next_batch(self,
|
37
|
+
if BatchJob.next_batch?(batch_id)
|
38
|
+
BatchJob.enqueue_next_batch(self, batch_id)
|
40
39
|
else
|
41
|
-
|
40
|
+
cleanup_batch(batch_id)
|
41
|
+
dataset = feature.dataset
|
42
42
|
dataset.after_fit_features
|
43
43
|
end
|
44
44
|
end
|
data/app/jobs/easy_ml/reaper.rb
CHANGED
@@ -9,8 +9,8 @@ module EasyML
|
|
9
9
|
{
|
10
10
|
worker: worker,
|
11
11
|
working: true,
|
12
|
-
class: args.dig("job_class"),
|
13
|
-
args: args.dig("arguments"),
|
12
|
+
class: args.is_a?(Hash) ? args.dig("job_class") : nil,
|
13
|
+
args: args.is_a?(Hash) ? args.dig("arguments") : nil,
|
14
14
|
pid: worker.pid,
|
15
15
|
}
|
16
16
|
else
|
@@ -19,17 +19,23 @@ module EasyML
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
-
def find_job(worker_class, *args)
|
22
|
+
def find_job(worker_class, *args, &block)
|
23
23
|
list_workers.select do |config|
|
24
|
-
config.dig(:class) == worker_class.to_s
|
24
|
+
selected = config.dig(:class) == worker_class.to_s
|
25
|
+
if block_given?
|
26
|
+
selected &&= yield(config)
|
27
|
+
else
|
28
|
+
selected &= config.dig(:args) == args
|
29
|
+
end
|
30
|
+
selected
|
25
31
|
end
|
26
32
|
end
|
27
33
|
|
28
|
-
def kill(worker_class, *args)
|
29
|
-
find_job(worker_class, *args).each do |job|
|
34
|
+
def kill(worker_class, *args, &block)
|
35
|
+
find_job(worker_class, *args, &block).each do |job|
|
30
36
|
begin
|
31
|
-
# Send
|
32
|
-
Process.kill("
|
37
|
+
# Send USR1 signal to the process
|
38
|
+
Process.kill("USR1", job[:pid])
|
33
39
|
|
34
40
|
# Remove the worker from Redis so it doesn't show up as a zombie
|
35
41
|
# in the Resque web interface. This is important because:
|
@@ -37,12 +43,10 @@ module EasyML
|
|
37
43
|
# 2. Prevents confusion about running workers
|
38
44
|
# 3. Allows proper worker cleanup in Redis
|
39
45
|
job[:worker].done_working
|
40
|
-
job[:worker].unregister_worker
|
41
46
|
rescue Errno::ESRCH
|
42
47
|
# Process already gone, but still try to clean up Redis
|
43
48
|
begin
|
44
49
|
job[:worker].done_working
|
45
|
-
job[:worker].unregister_worker
|
46
50
|
rescue => e
|
47
51
|
# Redis cleanup failed, worker might already be unregistered
|
48
52
|
puts "Failed to unregister worker: #{e.message}"
|
@@ -8,23 +8,7 @@ module EasyML
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def deserialize_dataframe(df_data)
|
11
|
-
|
12
|
-
|
13
|
-
columns = df_data["columns"].map do |col|
|
14
|
-
dtype = case col["datatype"]
|
15
|
-
when Hash
|
16
|
-
if col["datatype"]["Datetime"]
|
17
|
-
Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
|
18
|
-
else
|
19
|
-
Polars::Utf8
|
20
|
-
end
|
21
|
-
else
|
22
|
-
Polars.const_get(col["datatype"])
|
23
|
-
end
|
24
|
-
Polars::Series.new(col["name"], col["values"], dtype: dtype)
|
25
|
-
end
|
26
|
-
|
27
|
-
Polars::DataFrame.new(columns)
|
11
|
+
Polars::DataFrame.new(df_data)
|
28
12
|
end
|
29
13
|
end
|
30
14
|
end
|
@@ -10,7 +10,7 @@ module EasyML
|
|
10
10
|
|
11
11
|
def transform(df)
|
12
12
|
df = df.with_column(
|
13
|
-
Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
|
13
|
+
Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
|
14
14
|
)
|
15
15
|
df
|
16
16
|
end
|
@@ -28,12 +28,11 @@ module EasyML
|
|
28
28
|
if computed
|
29
29
|
cols = column_list.computed
|
30
30
|
else
|
31
|
-
cols = column_list
|
31
|
+
cols = column_list
|
32
32
|
end
|
33
33
|
|
34
34
|
by_name = cols.index_by(&:name)
|
35
|
-
|
36
|
-
column = by_name[col]
|
35
|
+
cols.each do |column|
|
37
36
|
df = column.transform(df, inference: inference, computed: computed) if column
|
38
37
|
end
|
39
38
|
|
@@ -15,8 +15,8 @@ module EasyML
|
|
15
15
|
(column.one_hot? && type.to_sym == :processed)
|
16
16
|
end
|
17
17
|
|
18
|
-
TYPES_ALL = %i(raw
|
19
|
-
TYPES_RAW = %i(raw
|
18
|
+
TYPES_ALL = %i(raw processed)
|
19
|
+
TYPES_RAW = %i(raw)
|
20
20
|
TYPES_PROCESSED = %i(processed)
|
21
21
|
|
22
22
|
def types(type = :all)
|
@@ -21,7 +21,10 @@ module EasyML
|
|
21
21
|
|
22
22
|
def run_queries(split, type)
|
23
23
|
queries = build_queries(split, type)
|
24
|
-
|
24
|
+
|
25
|
+
dataset.columns.apply_clip(
|
26
|
+
@dataset.send(type).send(split, all_columns: true, lazy: true)
|
27
|
+
).select(queries).collect
|
25
28
|
end
|
26
29
|
|
27
30
|
def get_column_statistics(query_results)
|