easy_ml 0.2.0.pre.rc56 → 0.2.0.pre.rc58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
- data/app/models/easy_ml/column.rb +55 -4
- data/app/models/easy_ml/column_history.rb +5 -1
- data/app/models/easy_ml/column_list.rb +46 -14
- data/app/models/easy_ml/dataset.rb +47 -27
- data/app/models/easy_ml/datasource.rb +1 -0
- data/app/models/easy_ml/feature.rb +10 -3
- data/app/models/easy_ml/model.rb +30 -6
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +4 -3
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +1 -1
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +9 -9
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +4 -4
- data/lib/easy_ml/core/model_evaluator.rb +18 -3
- data/lib/easy_ml/core/tuner.rb +23 -17
- data/lib/easy_ml/data/preprocessor.rb +10 -53
- data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
- data/lib/easy_ml/data/statistics_learner.rb +79 -14
- data/lib/easy_ml/data/synced_directory.rb +4 -2
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a25c50b89c079e7e62f52d1f5a52ef16f3d7bc9b388fcee9a7b0983148de9cd
|
4
|
+
data.tar.gz: bfcf0d06fbe498ccc70251c144649d7fd6699c2bdc4a9acbf5866e60ce04c7bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 77186b1d2d7558db7d128e03c68f8632af6f28be4b1bf2daa71ac804abbcc5a26470fe29a802258d326048155fea46a7f707e46fc88cf8571af5f30cb870d839
|
7
|
+
data.tar.gz: a22ba3e21ab32e64674033f0c83d023e41c3c5f158117be5fa8b85f1865a4bfc39bda72042e4c1ff80277d4d664fcc694349f7a377933c7c52f23417c299618b
|
@@ -53,6 +53,9 @@ module EasyML
|
|
53
53
|
flash[:notice] = "Model was successfully created."
|
54
54
|
redirect_to easy_ml_models_path
|
55
55
|
else
|
56
|
+
errors = model.errors.to_hash(true)
|
57
|
+
values = errors.values.flatten
|
58
|
+
flash.now[:error] = values.join(", ")
|
56
59
|
render inertia: "pages/NewModelPage", props: {
|
57
60
|
datasets: EasyML::Dataset.all.map do |dataset|
|
58
61
|
dataset.slice(:id, :name, :num_rows)
|
@@ -3,6 +3,11 @@ module EasyML
|
|
3
3
|
skip_before_action :verify_authenticity_token, only: [:create]
|
4
4
|
|
5
5
|
def create
|
6
|
+
slug = params[:model]
|
7
|
+
unless EasyML::Model.find_by(slug: slug).inference_version.present?
|
8
|
+
return render json: { error: "Model not found" }, status: :not_found
|
9
|
+
end
|
10
|
+
|
6
11
|
unless params.key?(:input)
|
7
12
|
return render json: { error: "Must provide key: input" }, status: :not_found
|
8
13
|
end
|
@@ -12,17 +17,17 @@ module EasyML
|
|
12
17
|
return render json: { error: "Input must be a hash" }, status: :not_found
|
13
18
|
end
|
14
19
|
|
15
|
-
|
16
|
-
unless
|
17
|
-
return render json: { error: "
|
20
|
+
valid, fields = EasyML::Predict.validate_input(slug, input)
|
21
|
+
unless valid
|
22
|
+
return render json: { error: "Missing required fields: #{fields}" }, status: :not_found
|
18
23
|
end
|
19
24
|
|
20
|
-
prediction = EasyML::Predict.predict(
|
25
|
+
prediction = EasyML::Predict.predict(slug, input)
|
21
26
|
|
22
27
|
render json: { prediction: EasyML::PredictionSerializer.new(prediction).serializable_hash.dig(:data, :attributes) }, status: :ok
|
23
28
|
rescue ActiveRecord::RecordNotFound
|
24
29
|
render json: { error: "Model not found" }, status: :not_found
|
25
|
-
rescue
|
30
|
+
rescue => e
|
26
31
|
render json: { error: e.message }, status: :unprocessable_entity
|
27
32
|
end
|
28
33
|
end
|
@@ -74,7 +74,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
|
|
74
74
|
dataset_id: initialData?.dataset_id || '',
|
75
75
|
task: initialData?.task || 'classification',
|
76
76
|
objective: initialData?.objective || 'binary:logistic',
|
77
|
-
metrics: initialData?.metrics || ['
|
77
|
+
metrics: initialData?.metrics || ['accuracy_score'],
|
78
78
|
retraining_job_attributes: initialData?.retraining_job ? {
|
79
79
|
id: initialData.retraining_job.id,
|
80
80
|
frequency: initialData.retraining_job.frequency,
|
@@ -61,7 +61,6 @@ export const SearchableSelect = forwardRef<HTMLButtonElement, SearchableSelectPr
|
|
61
61
|
}, [isOpen]);
|
62
62
|
|
63
63
|
const handleOptionClick = (optionValue: Option['value'], e: React.MouseEvent) => {
|
64
|
-
debugger;
|
65
64
|
e.preventDefault();
|
66
65
|
e.stopPropagation();
|
67
66
|
onChange(optionValue);
|
@@ -250,7 +250,7 @@ export function PreprocessingConfig({
|
|
250
250
|
setIsEditingDescription(true);
|
251
251
|
};
|
252
252
|
|
253
|
-
let nullCount = (column.statistics?.processed.null_count || column.statistics?.raw
|
253
|
+
let nullCount = (column.statistics?.processed.null_count || column.statistics?.raw?.null_count) || 0;
|
254
254
|
const nullPercentage = nullCount && column.statistics?.raw.num_rows
|
255
255
|
? ((nullCount / column.statistics.raw.num_rows) * 100)
|
256
256
|
: 0;
|
@@ -49,12 +49,10 @@ export default function DatasourcesPage({ datasources }: { datasources: Datasour
|
|
49
49
|
preserveScroll: true, // Keeps the scroll position
|
50
50
|
preserveState: true, // Keeps the form state
|
51
51
|
onSuccess: (e) => {
|
52
|
-
debugger;
|
53
52
|
console.log("SUCCESS")
|
54
53
|
// The page will automatically refresh with new data
|
55
54
|
},
|
56
55
|
onError: () => {
|
57
|
-
debugger;
|
58
56
|
// Handle error case if needed
|
59
57
|
console.error('Failed to sync datasource');
|
60
58
|
}
|
@@ -8,7 +8,7 @@
|
|
8
8
|
# description :string
|
9
9
|
# datatype :string
|
10
10
|
# polars_datatype :string
|
11
|
-
# is_target :boolean
|
11
|
+
# is_target :boolean default(FALSE)
|
12
12
|
# hidden :boolean default(FALSE)
|
13
13
|
# drop_if_null :boolean default(FALSE)
|
14
14
|
# preprocessing_steps :json
|
@@ -17,6 +17,8 @@
|
|
17
17
|
# created_at :datetime not null
|
18
18
|
# updated_at :datetime not null
|
19
19
|
# is_date_column :boolean default(FALSE)
|
20
|
+
# computed_by :string
|
21
|
+
# is_computed :boolean default(FALSE)
|
20
22
|
#
|
21
23
|
module EasyML
|
22
24
|
class Column < ActiveRecord::Base
|
@@ -30,7 +32,6 @@ module EasyML
|
|
30
32
|
validates :name, uniqueness: { scope: :dataset_id }
|
31
33
|
|
32
34
|
before_save :ensure_valid_datatype
|
33
|
-
after_create :set_date_column_if_date_splitter
|
34
35
|
after_save :handle_date_column_change
|
35
36
|
before_save :set_defaults
|
36
37
|
|
@@ -40,6 +41,21 @@ module EasyML
|
|
40
41
|
scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
|
41
42
|
scope :datetime, -> { where(datatype: "datetime") }
|
42
43
|
scope :date_column, -> { where(is_date_column: true) }
|
44
|
+
scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
|
45
|
+
scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
|
46
|
+
scope :computed, -> { where(is_computed: true) }
|
47
|
+
|
48
|
+
def aliases
|
49
|
+
[name].concat(virtual_columns)
|
50
|
+
end
|
51
|
+
|
52
|
+
def virtual_columns
|
53
|
+
if one_hot?
|
54
|
+
allowed_categories.map { |cat| "#{name}_#{cat}" }
|
55
|
+
else
|
56
|
+
[]
|
57
|
+
end
|
58
|
+
end
|
43
59
|
|
44
60
|
def datatype=(dtype)
|
45
61
|
write_attribute(:datatype, dtype)
|
@@ -88,15 +104,50 @@ module EasyML
|
|
88
104
|
end
|
89
105
|
|
90
106
|
def allowed_categories
|
91
|
-
return
|
107
|
+
return [] unless one_hot?
|
108
|
+
stats = dataset.statistics
|
109
|
+
return [] if stats.nil? || stats.blank?
|
110
|
+
|
111
|
+
stats = stats.deep_symbolize_keys
|
112
|
+
stats = stats.dig(:raw)
|
92
113
|
|
93
|
-
|
114
|
+
(stats.dig(name.to_sym, :allowed_categories) || []).sort.concat(["other"])
|
94
115
|
end
|
95
116
|
|
96
117
|
def date_column?
|
97
118
|
is_date_column
|
98
119
|
end
|
99
120
|
|
121
|
+
def lineage
|
122
|
+
[
|
123
|
+
present_in_raw_dataset ? "Raw dataset" : nil,
|
124
|
+
computed_by ? "Computed by #{computed_by}" : nil,
|
125
|
+
preprocessing_steps.present? ? "Preprocessed using #{preprocessing_steps.keys.join(", ")}" : nil,
|
126
|
+
].compact
|
127
|
+
end
|
128
|
+
|
129
|
+
def required?
|
130
|
+
is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
|
131
|
+
end
|
132
|
+
|
133
|
+
def present_in_raw_dataset
|
134
|
+
dataset.raw.data&.columns&.include?(name) || false
|
135
|
+
end
|
136
|
+
|
137
|
+
def sort_required
|
138
|
+
required? ? 0 : 1
|
139
|
+
end
|
140
|
+
|
141
|
+
def to_api
|
142
|
+
{
|
143
|
+
name: name,
|
144
|
+
datatype: datatype,
|
145
|
+
description: description,
|
146
|
+
required: required?,
|
147
|
+
allowed_values: allowed_categories.empty? ? nil : allowed_categories,
|
148
|
+
}.compact
|
149
|
+
end
|
150
|
+
|
100
151
|
private
|
101
152
|
|
102
153
|
def set_defaults
|
@@ -9,7 +9,7 @@
|
|
9
9
|
# description :string
|
10
10
|
# datatype :string
|
11
11
|
# polars_datatype :string
|
12
|
-
# is_target :boolean
|
12
|
+
# is_target :boolean default(FALSE)
|
13
13
|
# hidden :boolean default(FALSE)
|
14
14
|
# drop_if_null :boolean default(FALSE)
|
15
15
|
# preprocessing_steps :json
|
@@ -22,10 +22,14 @@
|
|
22
22
|
# history_user_id :integer
|
23
23
|
# snapshot_id :string
|
24
24
|
# is_date_column :boolean default(FALSE)
|
25
|
+
# computed_by :string
|
26
|
+
# is_computed :boolean default(FALSE)
|
25
27
|
#
|
26
28
|
module EasyML
|
27
29
|
class ColumnHistory < ActiveRecord::Base
|
28
30
|
self.table_name = "easy_ml_column_histories"
|
29
31
|
include Historiographer::History
|
32
|
+
scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
|
33
|
+
scope :computed, -> { where(is_computed: true) }
|
30
34
|
end
|
31
35
|
end
|
@@ -1,16 +1,19 @@
|
|
1
1
|
module EasyML
|
2
2
|
module ColumnList
|
3
|
-
|
3
|
+
include Historiographer::Relation
|
4
|
+
|
5
|
+
def sync(delete: true)
|
4
6
|
return unless dataset.schema.present?
|
5
7
|
|
6
8
|
EasyML::Column.transaction do
|
7
9
|
col_names = syncable
|
8
10
|
existing_columns = where(name: col_names)
|
9
11
|
import_new(col_names, existing_columns)
|
12
|
+
update_existing(existing_columns)
|
13
|
+
set_feature_lineage
|
10
14
|
|
11
|
-
if
|
12
|
-
|
13
|
-
delete_missing(existing_columns)
|
15
|
+
if delete
|
16
|
+
delete_missing(col_names)
|
14
17
|
end
|
15
18
|
|
16
19
|
if existing_columns.none? # Totally new dataset
|
@@ -37,14 +40,9 @@ module EasyML
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
def virtual_column?(column)
|
41
|
-
false
|
42
|
-
end
|
43
|
-
|
44
43
|
def syncable
|
45
44
|
dataset.processed_schema.keys.select do |col|
|
46
|
-
!one_hot?(col)
|
47
|
-
!virtual_column?(col)
|
45
|
+
!one_hot?(col)
|
48
46
|
end
|
49
47
|
end
|
50
48
|
|
@@ -56,8 +54,36 @@ module EasyML
|
|
56
54
|
proxy_association.owner
|
57
55
|
end
|
58
56
|
|
57
|
+
def sort_by_required
|
58
|
+
column_list.sort_by { |col| [col.sort_required, col.name] }
|
59
|
+
end
|
60
|
+
|
59
61
|
private
|
60
62
|
|
63
|
+
def set_feature_lineage
|
64
|
+
# Get all features that compute columns
|
65
|
+
features_computing_columns = dataset.features.all.map do |feature|
|
66
|
+
[feature.name, feature.computes_columns]
|
67
|
+
end.compact.to_h
|
68
|
+
|
69
|
+
updates = column_list.reload.map do |column|
|
70
|
+
# Check if column is computed by any feature
|
71
|
+
computing_feature = features_computing_columns.find { |_, cols| cols.include?(column.name) }&.first
|
72
|
+
is_computed = !computing_feature.nil?
|
73
|
+
|
74
|
+
column.assign_attributes(
|
75
|
+
computed_by: computing_feature,
|
76
|
+
is_computed: is_computed,
|
77
|
+
)
|
78
|
+
next unless column.changed?
|
79
|
+
|
80
|
+
column
|
81
|
+
end.compact
|
82
|
+
EasyML::Column.import(updates.to_a, { on_duplicate_key_update: { columns: %i[computed_by is_computed] } })
|
83
|
+
cols = EasyML::Column.where(id: updates.map(&:id)).to_a
|
84
|
+
column_list.bulk_record_history(cols, { history_user_id: 1 })
|
85
|
+
end
|
86
|
+
|
61
87
|
def import_new(new_columns, existing_columns)
|
62
88
|
new_columns = new_columns - existing_columns.map(&:name)
|
63
89
|
cols_to_insert = new_columns.map do |col_name|
|
@@ -67,6 +93,7 @@ module EasyML
|
|
67
93
|
)
|
68
94
|
end
|
69
95
|
EasyML::Column.import(cols_to_insert)
|
96
|
+
column_list.reload
|
70
97
|
end
|
71
98
|
|
72
99
|
def update_existing(existing_columns)
|
@@ -116,13 +143,18 @@ module EasyML
|
|
116
143
|
end
|
117
144
|
EasyML::Column.import(existing_columns.to_a,
|
118
145
|
{ on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
|
119
|
-
sample_values] } })
|
146
|
+
sample_values computed_by is_computed] } })
|
120
147
|
end
|
121
148
|
|
122
|
-
def delete_missing(
|
123
|
-
raw_cols = dataset.
|
149
|
+
def delete_missing(col_names)
|
150
|
+
raw_cols = dataset.best_segment.train(all_columns: true, limit: 1).columns
|
124
151
|
raw_cols = where(name: raw_cols)
|
125
|
-
columns_to_delete = column_list
|
152
|
+
columns_to_delete = column_list.select do |col|
|
153
|
+
col_names.exclude?(col.name) &&
|
154
|
+
one_hots.map(&:name).exclude?(col.name) &&
|
155
|
+
raw_cols.map(&:name).exclude?(col.name) &&
|
156
|
+
dataset.features.flat_map(&:computes_columns).exclude?(col.name)
|
157
|
+
end
|
126
158
|
columns_to_delete.each(&:destroy!)
|
127
159
|
end
|
128
160
|
end
|
@@ -140,6 +140,12 @@ module EasyML
|
|
140
140
|
EasyML::RefreshDatasetJob.perform_later(id)
|
141
141
|
end
|
142
142
|
|
143
|
+
def best_segment
|
144
|
+
[processed, raw].detect do |segment|
|
145
|
+
segment.send(:train, all_columns: true, limit: 1)&.columns
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
143
149
|
def raw
|
144
150
|
return @raw if @raw && @raw.dataset
|
145
151
|
|
@@ -175,9 +181,10 @@ module EasyML
|
|
175
181
|
|
176
182
|
def actually_refresh
|
177
183
|
refreshing do
|
184
|
+
learn(delete: false) # After syncing datasource, learn new statistics + sync columns
|
178
185
|
process_data
|
179
186
|
fully_reload
|
180
|
-
learn
|
187
|
+
learn # After processing data, we may have new columns from newly applied features
|
181
188
|
now = UTC.now
|
182
189
|
update(workflow_status: "ready", refreshed_at: now, updated_at: now)
|
183
190
|
fully_reload
|
@@ -272,10 +279,10 @@ module EasyML
|
|
272
279
|
raw.split_at.present? && raw.split_at < datasource.last_updated_at
|
273
280
|
end
|
274
281
|
|
275
|
-
def learn(
|
282
|
+
def learn(delete: true)
|
276
283
|
learn_schema
|
277
284
|
learn_statistics
|
278
|
-
columns.sync(
|
285
|
+
columns.sync(delete: delete)
|
279
286
|
end
|
280
287
|
|
281
288
|
def refreshing
|
@@ -336,21 +343,25 @@ module EasyML
|
|
336
343
|
|
337
344
|
def learn_statistics
|
338
345
|
stats = {
|
339
|
-
raw: EasyML::Data::StatisticsLearner.learn(raw, self),
|
346
|
+
raw: EasyML::Data::StatisticsLearner.learn(raw, self, :raw),
|
340
347
|
}
|
341
|
-
stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self)) if processed.data.present?
|
348
|
+
stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self, :processed)) if processed.data.present?
|
349
|
+
|
350
|
+
columns.select(&:is_computed).each do |col|
|
351
|
+
if stats.dig(:processed, col.name)
|
352
|
+
stats[:raw][col.name] = stats[:processed][col.name]
|
353
|
+
end
|
354
|
+
end
|
342
355
|
|
343
356
|
update(statistics: stats)
|
344
357
|
end
|
345
358
|
|
346
359
|
def process_data
|
347
|
-
split_data
|
348
360
|
fit
|
349
361
|
normalize_all
|
350
|
-
# alert_nulls
|
351
362
|
end
|
352
363
|
|
353
|
-
def needs_learn?
|
364
|
+
def needs_learn?
|
354
365
|
return true if columns_need_refresh?
|
355
366
|
|
356
367
|
never_learned = columns.none?
|
@@ -359,6 +370,7 @@ module EasyML
|
|
359
370
|
new_features = features.any? { |f| f.updated_at > columns.maximum(:updated_at) }
|
360
371
|
return true if new_features
|
361
372
|
|
373
|
+
df = raw.query(limit: 1)
|
362
374
|
new_cols = df.present? ? (df.columns - columns.map(&:name)) : []
|
363
375
|
new_cols = columns.syncable
|
364
376
|
|
@@ -390,22 +402,24 @@ module EasyML
|
|
390
402
|
{ differing_columns: differing_columns, differences: differences }
|
391
403
|
end
|
392
404
|
|
393
|
-
def
|
394
|
-
|
395
|
-
|
405
|
+
def validate_input(df)
|
406
|
+
fields = missing_required_fields(df)
|
407
|
+
return fields.empty?, fields
|
408
|
+
end
|
409
|
+
|
410
|
+
def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
|
396
411
|
df = apply_missing_features(df, inference: inference)
|
412
|
+
df = drop_nulls(df)
|
397
413
|
df = preprocessor.postprocess(df, inference: inference)
|
398
|
-
|
399
|
-
#
|
400
|
-
|
401
|
-
learn(only_new: true) if idx == 1 && needs_learn?(df)
|
414
|
+
df = apply_features(df, features)
|
415
|
+
learn unless inference # After applying features, we need to learn new statistics
|
416
|
+
df = preprocessor.postprocess(df, inference: inference, computed: true)
|
402
417
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
403
|
-
raise_on_nulls(df) if inference
|
404
418
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
405
419
|
df
|
406
420
|
end
|
407
421
|
|
408
|
-
def
|
422
|
+
def missing_required_fields(df)
|
409
423
|
desc_df = df.describe
|
410
424
|
|
411
425
|
# Get the 'null_count' row
|
@@ -416,8 +430,10 @@ module EasyML
|
|
416
430
|
null_count_row[col][0].to_i > 0
|
417
431
|
end
|
418
432
|
|
419
|
-
|
420
|
-
|
433
|
+
# This is a history class, because this only occurs on prediction
|
434
|
+
required_columns = columns.current.required.map(&:name)
|
435
|
+
required_columns.select do |col|
|
436
|
+
columns_with_nulls.include?(col) || df.columns.map(&:to_s).exclude?(col.to_s)
|
421
437
|
end
|
422
438
|
end
|
423
439
|
|
@@ -487,7 +503,7 @@ module EasyML
|
|
487
503
|
end
|
488
504
|
|
489
505
|
def preprocessing_steps
|
490
|
-
return if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
|
506
|
+
return {} if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
|
491
507
|
return @preprocessing_steps if @preprocessing_steps.present?
|
492
508
|
|
493
509
|
training = standardize_preprocessing_steps(:training)
|
@@ -515,7 +531,7 @@ module EasyML
|
|
515
531
|
end
|
516
532
|
|
517
533
|
def drop_cols
|
518
|
-
@drop_cols ||= preloaded_columns.select(&:hidden).
|
534
|
+
@drop_cols ||= preloaded_columns.select(&:hidden).flat_map(&:aliases)
|
519
535
|
end
|
520
536
|
|
521
537
|
def drop_if_null
|
@@ -552,10 +568,14 @@ module EasyML
|
|
552
568
|
df[column_mask(df, inference: inference)]
|
553
569
|
end
|
554
570
|
|
555
|
-
def apply_missing_features(df, inference: false)
|
571
|
+
def apply_missing_features(df, inference: false, include_one_hots: false)
|
556
572
|
return df unless inference
|
557
573
|
|
558
574
|
missing_features = (col_order(inference: inference) - df.columns).compact
|
575
|
+
unless include_one_hots
|
576
|
+
missing_features -= columns.one_hots.flat_map(&:virtual_columns) unless include_one_hots
|
577
|
+
missing_features += columns.one_hots.map(&:name) - df.columns
|
578
|
+
end
|
559
579
|
df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
|
560
580
|
end
|
561
581
|
|
@@ -661,9 +681,9 @@ module EasyML
|
|
661
681
|
def normalize_all
|
662
682
|
processed.cleanup
|
663
683
|
|
664
|
-
SPLIT_ORDER.
|
684
|
+
SPLIT_ORDER.each do |segment|
|
665
685
|
df = raw.read(segment)
|
666
|
-
processed_df = normalize(df, all_columns: true
|
686
|
+
processed_df = normalize(df, all_columns: true)
|
667
687
|
processed.save(segment, processed_df)
|
668
688
|
end
|
669
689
|
@normalized = true
|
@@ -687,8 +707,9 @@ module EasyML
|
|
687
707
|
end
|
688
708
|
|
689
709
|
def fit
|
690
|
-
|
691
|
-
|
710
|
+
computed_statistics = columns.where(is_computed: true).reduce({}) { |h, c| h.tap { h[c.name] = c.statistics.dig("processed") } }
|
711
|
+
preprocessor.fit(raw.train(all_columns: true), computed_statistics)
|
712
|
+
update(preprocessor_statistics: preprocessor.statistics)
|
692
713
|
end
|
693
714
|
|
694
715
|
# log_method :fit, "Learning statistics", verbose: true
|
@@ -701,7 +722,6 @@ module EasyML
|
|
701
722
|
return unless force || should_split?
|
702
723
|
|
703
724
|
cleanup
|
704
|
-
features = self.features.ordered.load
|
705
725
|
splitter.split(datasource) do |train_df, valid_df, test_df|
|
706
726
|
[:train, :valid, :test].zip([train_df, valid_df, test_df]).each do |segment, df|
|
707
727
|
raw.save(segment, df)
|
@@ -55,6 +55,7 @@ module EasyML
|
|
55
55
|
|
56
56
|
has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
|
57
57
|
attr_accessor :schema, :columns, :num_rows, :is_syncing
|
58
|
+
belongs_to :dataset, class_name: "EasyML::Dataset", optional: true, dependent: :destroy
|
58
59
|
|
59
60
|
add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
|
60
61
|
DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
|
@@ -165,6 +165,13 @@ module EasyML
|
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
168
|
+
def computes_columns
|
169
|
+
unless adapter.respond_to?(:computes_columns)
|
170
|
+
raise "Feature #{feature_class} must declare which columns it computes using the :computes_columns method"
|
171
|
+
end
|
172
|
+
adapter.computes_columns
|
173
|
+
end
|
174
|
+
|
168
175
|
def build_batches
|
169
176
|
if batchable?
|
170
177
|
batch
|
@@ -239,7 +246,7 @@ module EasyML
|
|
239
246
|
|
240
247
|
# Transform a single batch, used for testing the user's feature implementation
|
241
248
|
def transform_batch(df = nil, batch_args = {})
|
242
|
-
if df.
|
249
|
+
if df.is_a?(Polars::DataFrame)
|
243
250
|
actually_transform_batch(df)
|
244
251
|
else
|
245
252
|
actually_transform_batch(build_batch(get_batch_args(**batch_args)))
|
@@ -296,8 +303,8 @@ module EasyML
|
|
296
303
|
end
|
297
304
|
|
298
305
|
def actually_transform_batch(df)
|
299
|
-
return nil unless df.
|
300
|
-
return df if adapter.respond_to?(:
|
306
|
+
return nil unless df.is_a?(Polars::DataFrame)
|
307
|
+
return df if !adapter.respond_to?(:transform) && feature_store.empty?
|
301
308
|
|
302
309
|
result = adapter.transform(df, self)
|
303
310
|
update!(applied_at: Time.current)
|