easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +344 -39
- data/app/models/easy_ml/column_history.rb +31 -20
- data/app/models/easy_ml/column_list.rb +79 -62
- data/app/models/easy_ml/dataset.rb +156 -104
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +4 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +29 -10
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/models/easy_ml/model.rb +25 -4
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +5 -3
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +45 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -383
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -128
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -1,24 +1,24 @@
|
|
1
|
-
# ==
|
1
|
+
# == Schema Information
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_datasets
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# name
|
7
|
-
# description
|
8
|
-
# dataset_type
|
9
|
-
# status
|
10
|
-
# version
|
11
|
-
# datasource_id
|
12
|
-
# root_dir
|
13
|
-
# configuration
|
14
|
-
# num_rows
|
15
|
-
# workflow_status
|
16
|
-
# statistics
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# name :string not null
|
7
|
+
# description :string
|
8
|
+
# dataset_type :string
|
9
|
+
# status :string
|
10
|
+
# version :string
|
11
|
+
# datasource_id :bigint
|
12
|
+
# root_dir :string
|
13
|
+
# configuration :json
|
14
|
+
# num_rows :bigint
|
15
|
+
# workflow_status :string
|
16
|
+
# statistics :json
|
17
|
+
# schema :json
|
18
|
+
# refreshed_at :datetime
|
19
|
+
# created_at :datetime not null
|
20
|
+
# updated_at :datetime not null
|
21
|
+
# last_datasource_sha :string
|
22
22
|
#
|
23
23
|
module EasyML
|
24
24
|
class Dataset < ActiveRecord::Base
|
@@ -45,7 +45,7 @@ module EasyML
|
|
45
45
|
has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
|
46
46
|
accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
|
47
47
|
|
48
|
-
has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
|
48
|
+
has_many :features, dependent: :destroy, class_name: "EasyML::Feature", extend: EasyML::FeatureList
|
49
49
|
accepts_nested_attributes_for :features, allow_destroy: true
|
50
50
|
|
51
51
|
has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
|
@@ -80,7 +80,7 @@ module EasyML
|
|
80
80
|
column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
|
81
81
|
{ value: type.to_s, label: type.to_s.titleize }
|
82
82
|
end,
|
83
|
-
preprocessing_strategies: EasyML::
|
83
|
+
preprocessing_strategies: EasyML::Column::Imputers.constants[:preprocessing_strategies],
|
84
84
|
feature_options: EasyML::Features::Registry.list_flat,
|
85
85
|
splitter_constants: EasyML::Splitter.constants,
|
86
86
|
}
|
@@ -119,13 +119,6 @@ module EasyML
|
|
119
119
|
processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
|
120
120
|
end
|
121
121
|
|
122
|
-
def refresh_datatypes
|
123
|
-
return unless columns_need_refresh?
|
124
|
-
|
125
|
-
cleanup
|
126
|
-
datasource.reread(columns)
|
127
|
-
end
|
128
|
-
|
129
122
|
def num_rows
|
130
123
|
if datasource&.num_rows.nil?
|
131
124
|
datasource.after_sync
|
@@ -140,6 +133,12 @@ module EasyML
|
|
140
133
|
EasyML::RefreshDatasetJob.perform_later(id)
|
141
134
|
end
|
142
135
|
|
136
|
+
def best_segment
|
137
|
+
[processed, raw].detect do |segment|
|
138
|
+
segment.send(:data, all_columns: true, limit: 1)&.columns
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
143
142
|
def raw
|
144
143
|
return @raw if @raw && @raw.dataset
|
145
144
|
|
@@ -162,22 +161,36 @@ module EasyML
|
|
162
161
|
save
|
163
162
|
end
|
164
163
|
|
164
|
+
def refreshed_datasource?
|
165
|
+
last_datasource_sha_changed?
|
166
|
+
end
|
167
|
+
|
168
|
+
def prepare_features
|
169
|
+
features.update_all(workflow_status: "ready")
|
170
|
+
end
|
171
|
+
|
165
172
|
def prepare!
|
173
|
+
prepare_features
|
166
174
|
cleanup
|
167
175
|
refresh_datasource!
|
168
176
|
split_data
|
177
|
+
process_data
|
169
178
|
end
|
170
179
|
|
171
180
|
def prepare
|
181
|
+
prepare_features
|
172
182
|
refresh_datasource
|
173
183
|
split_data
|
184
|
+
process_data
|
174
185
|
end
|
175
186
|
|
176
187
|
def actually_refresh
|
177
188
|
refreshing do
|
189
|
+
learn(delete: false) # After syncing datasource, learn new statistics + sync columns
|
178
190
|
process_data
|
179
191
|
fully_reload
|
180
192
|
learn
|
193
|
+
learn_statistics(type: :processed) # After processing data, we learn any new statistics
|
181
194
|
now = UTC.now
|
182
195
|
update(workflow_status: "ready", refreshed_at: now, updated_at: now)
|
183
196
|
fully_reload
|
@@ -245,19 +258,57 @@ module EasyML
|
|
245
258
|
features_need_fit.any?
|
246
259
|
end
|
247
260
|
|
248
|
-
|
261
|
+
# Some of these are expensive to calculate, so we only want to include
|
262
|
+
# them in the refresh reasons if they are actually needed.
|
263
|
+
#
|
264
|
+
# During dataset_serializer for instance, we don't want to check s3,
|
265
|
+
# we only do that during background jobs.
|
266
|
+
#
|
267
|
+
# So yes this is an annoying way to structure a method, but it's helpful for performance
|
268
|
+
#
|
269
|
+
def refresh_reasons(exclude: [])
|
249
270
|
{
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
271
|
+
not_split: {
|
272
|
+
name: "Not split",
|
273
|
+
check: -> { not_split? },
|
274
|
+
},
|
275
|
+
refreshed_at_is_nil: {
|
276
|
+
name: "Refreshed at is nil",
|
277
|
+
check: -> { refreshed_at.nil? },
|
278
|
+
},
|
279
|
+
columns_need_refresh: {
|
280
|
+
name: "Columns need refresh",
|
281
|
+
check: -> { columns_need_refresh? },
|
282
|
+
},
|
283
|
+
features_need_fit: {
|
284
|
+
name: "Features need fit",
|
285
|
+
check: -> { features_need_fit? },
|
286
|
+
},
|
287
|
+
datasource_needs_refresh: {
|
288
|
+
name: "Datasource needs refresh",
|
289
|
+
check: -> { datasource_needs_refresh? },
|
290
|
+
},
|
291
|
+
refreshed_datasource: {
|
292
|
+
name: "Refreshed datasource",
|
293
|
+
check: -> { refreshed_datasource? },
|
294
|
+
},
|
295
|
+
datasource_was_refreshed: {
|
296
|
+
name: "Datasource was refreshed",
|
297
|
+
check: -> { datasource_was_refreshed? },
|
298
|
+
},
|
299
|
+
}.except(*exclude).select do |k, config|
|
300
|
+
config[:check].call
|
301
|
+
end.map do |k, config|
|
302
|
+
config[:name]
|
303
|
+
end
|
257
304
|
end
|
258
305
|
|
259
|
-
def needs_refresh?
|
260
|
-
refresh_reasons.any?
|
306
|
+
def needs_refresh?(exclude: [])
|
307
|
+
refresh_reasons(exclude: exclude).any?
|
308
|
+
end
|
309
|
+
|
310
|
+
def processed?
|
311
|
+
!needs_refresh?
|
261
312
|
end
|
262
313
|
|
263
314
|
def not_split?
|
@@ -274,7 +325,6 @@ module EasyML
|
|
274
325
|
|
275
326
|
def learn(delete: true)
|
276
327
|
learn_schema
|
277
|
-
learn_statistics
|
278
328
|
columns.sync(delete: delete)
|
279
329
|
end
|
280
330
|
|
@@ -326,6 +376,8 @@ module EasyML
|
|
326
376
|
|
327
377
|
def learn_schema
|
328
378
|
data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
|
379
|
+
return nil if data.nil?
|
380
|
+
|
329
381
|
schema = data.schema.reduce({}) do |h, (k, v)|
|
330
382
|
h.tap do
|
331
383
|
h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
|
@@ -334,23 +386,23 @@ module EasyML
|
|
334
386
|
write_attribute(:schema, schema)
|
335
387
|
end
|
336
388
|
|
337
|
-
def learn_statistics
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
389
|
+
def learn_statistics(type: :raw, computed: false)
|
390
|
+
columns.learn(type: type, computed: computed)
|
391
|
+
update(
|
392
|
+
statistics: columns.reload.statistics,
|
393
|
+
)
|
394
|
+
end
|
342
395
|
|
343
|
-
|
396
|
+
def statistics
|
397
|
+
(read_attribute(:statistics) || {}).with_indifferent_access
|
344
398
|
end
|
345
399
|
|
346
400
|
def process_data
|
347
|
-
split_data
|
348
401
|
fit
|
349
402
|
normalize_all
|
350
|
-
# alert_nulls
|
351
403
|
end
|
352
404
|
|
353
|
-
def needs_learn?
|
405
|
+
def needs_learn?
|
354
406
|
return true if columns_need_refresh?
|
355
407
|
|
356
408
|
never_learned = columns.none?
|
@@ -359,6 +411,7 @@ module EasyML
|
|
359
411
|
new_features = features.any? { |f| f.updated_at > columns.maximum(:updated_at) }
|
360
412
|
return true if new_features
|
361
413
|
|
414
|
+
df = raw.query(limit: 1)
|
362
415
|
new_cols = df.present? ? (df.columns - columns.map(&:name)) : []
|
363
416
|
new_cols = columns.syncable
|
364
417
|
|
@@ -390,22 +443,23 @@ module EasyML
|
|
390
443
|
{ differing_columns: differing_columns, differences: differences }
|
391
444
|
end
|
392
445
|
|
393
|
-
def
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
df = preprocessor.postprocess(df, inference: inference)
|
446
|
+
def validate_input(df)
|
447
|
+
fields = missing_required_fields(df)
|
448
|
+
return fields.empty?, fields
|
449
|
+
end
|
398
450
|
|
399
|
-
|
400
|
-
|
401
|
-
|
451
|
+
def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
|
452
|
+
df = apply_missing_features(df, inference: inference)
|
453
|
+
df = drop_nulls(df)
|
454
|
+
df = columns.transform(df, inference: inference)
|
455
|
+
df = apply_features(df, features)
|
456
|
+
df = columns.transform(df, inference: inference, computed: true)
|
402
457
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
403
|
-
raise_on_nulls(df) if inference
|
404
458
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
405
459
|
df
|
406
460
|
end
|
407
461
|
|
408
|
-
def
|
462
|
+
def missing_required_fields(df)
|
409
463
|
desc_df = df.describe
|
410
464
|
|
411
465
|
# Get the 'null_count' row
|
@@ -416,8 +470,10 @@ module EasyML
|
|
416
470
|
null_count_row[col][0].to_i > 0
|
417
471
|
end
|
418
472
|
|
419
|
-
|
420
|
-
|
473
|
+
# This is a history class, because this only occurs on prediction
|
474
|
+
required_columns = columns.current.required.map(&:name)
|
475
|
+
required_columns.select do |col|
|
476
|
+
columns_with_nulls.include?(col) || df.columns.map(&:to_s).exclude?(col.to_s)
|
421
477
|
end
|
422
478
|
end
|
423
479
|
|
@@ -478,16 +534,15 @@ module EasyML
|
|
478
534
|
result.empty? ? nil : result
|
479
535
|
end
|
480
536
|
|
481
|
-
def processed?
|
482
|
-
!should_split?
|
483
|
-
end
|
484
|
-
|
485
537
|
def decode_labels(ys, col: nil)
|
486
|
-
|
538
|
+
if col.nil?
|
539
|
+
col = target
|
540
|
+
end
|
541
|
+
preloaded_columns.find_by(name: col).decode_labels(ys)
|
487
542
|
end
|
488
543
|
|
489
544
|
def preprocessing_steps
|
490
|
-
return if
|
545
|
+
return {} if preloaded_columns.nil? || (preloaded_columns.respond_to?(:empty?) && preloaded_columns.empty?)
|
491
546
|
return @preprocessing_steps if @preprocessing_steps.present?
|
492
547
|
|
493
548
|
training = standardize_preprocessing_steps(:training)
|
@@ -499,23 +554,16 @@ module EasyML
|
|
499
554
|
}.compact.deep_symbolize_keys
|
500
555
|
end
|
501
556
|
|
502
|
-
def preprocessor
|
503
|
-
@preprocessor ||= initialize_preprocessor
|
504
|
-
return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
|
505
|
-
|
506
|
-
@preprocessor = initialize_preprocessor
|
507
|
-
end
|
508
|
-
|
509
557
|
def target
|
510
558
|
@target ||= preloaded_columns.find(&:is_target)&.name
|
511
559
|
end
|
512
560
|
|
513
561
|
def date_column
|
514
|
-
@date_column ||=
|
562
|
+
@date_column ||= preloaded_columns.find(&:is_date_column?)
|
515
563
|
end
|
516
564
|
|
517
565
|
def drop_cols
|
518
|
-
@drop_cols ||= preloaded_columns.select(&:hidden).flat_map(&:
|
566
|
+
@drop_cols ||= preloaded_columns.select(&:hidden).flat_map(&:aliases)
|
519
567
|
end
|
520
568
|
|
521
569
|
def drop_if_null
|
@@ -552,10 +600,14 @@ module EasyML
|
|
552
600
|
df[column_mask(df, inference: inference)]
|
553
601
|
end
|
554
602
|
|
555
|
-
def apply_missing_features(df, inference: false)
|
603
|
+
def apply_missing_features(df, inference: false, include_one_hots: false)
|
556
604
|
return df unless inference
|
557
605
|
|
558
606
|
missing_features = (col_order(inference: inference) - df.columns).compact
|
607
|
+
unless include_one_hots
|
608
|
+
missing_features -= columns.one_hots.flat_map(&:virtual_columns) unless include_one_hots
|
609
|
+
missing_features += columns.one_hots.map(&:name) - df.columns
|
610
|
+
end
|
559
611
|
df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
|
560
612
|
end
|
561
613
|
|
@@ -576,7 +628,7 @@ module EasyML
|
|
576
628
|
end
|
577
629
|
|
578
630
|
def upload_remote_files
|
579
|
-
return
|
631
|
+
return if !needs_refresh?
|
580
632
|
|
581
633
|
processed.upload.tap do
|
582
634
|
features.each(&:upload_remote_files)
|
@@ -648,27 +700,42 @@ module EasyML
|
|
648
700
|
|
649
701
|
def refresh_datasource
|
650
702
|
datasource.reload.refresh
|
651
|
-
|
652
|
-
initialize_splits
|
703
|
+
after_refresh_datasource
|
653
704
|
end
|
654
705
|
|
655
706
|
def refresh_datasource!
|
656
707
|
datasource.reload.refresh!
|
657
|
-
|
708
|
+
after_refresh_datasource
|
709
|
+
end
|
710
|
+
|
711
|
+
def after_refresh_datasource
|
712
|
+
update(last_datasource_sha: datasource.sha)
|
658
713
|
initialize_splits
|
659
714
|
end
|
660
715
|
|
661
716
|
def normalize_all
|
662
717
|
processed.cleanup
|
663
718
|
|
664
|
-
SPLIT_ORDER.
|
719
|
+
SPLIT_ORDER.each do |segment|
|
665
720
|
df = raw.read(segment)
|
666
|
-
|
721
|
+
learn_computed_columns(df) if segment == :train
|
722
|
+
processed_df = normalize(df, all_columns: true)
|
667
723
|
processed.save(segment, processed_df)
|
668
724
|
end
|
669
725
|
@normalized = true
|
670
726
|
end
|
671
727
|
|
728
|
+
def learn_computed_columns(df)
|
729
|
+
return unless features.ready_to_apply.any?
|
730
|
+
|
731
|
+
df = df.clone
|
732
|
+
df = apply_features(df)
|
733
|
+
processed.save(:train, df)
|
734
|
+
learn(delete: false)
|
735
|
+
learn_statistics(type: :processed, computed: true)
|
736
|
+
processed.cleanup
|
737
|
+
end
|
738
|
+
|
672
739
|
def drop_nulls(df)
|
673
740
|
return df if drop_if_null.nil? || drop_if_null.empty?
|
674
741
|
|
@@ -678,8 +745,12 @@ module EasyML
|
|
678
745
|
df.drop_nulls(subset: drop)
|
679
746
|
end
|
680
747
|
|
748
|
+
# Pass refresh: false for frontend views so we don't query S3 during web requests
|
681
749
|
def load_data(segment, **kwargs, &block)
|
682
|
-
|
750
|
+
needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
|
751
|
+
kwargs.delete(:refresh)
|
752
|
+
|
753
|
+
if !needs_refresh
|
683
754
|
processed.load_data(segment, **kwargs, &block)
|
684
755
|
else
|
685
756
|
raw.load_data(segment, **kwargs, &block)
|
@@ -687,8 +758,7 @@ module EasyML
|
|
687
758
|
end
|
688
759
|
|
689
760
|
def fit
|
690
|
-
|
691
|
-
self.preprocessor_statistics = preprocessor.statistics
|
761
|
+
learn_statistics(type: :raw)
|
692
762
|
end
|
693
763
|
|
694
764
|
# log_method :fit, "Learning statistics", verbose: true
|
@@ -698,10 +768,9 @@ module EasyML
|
|
698
768
|
end
|
699
769
|
|
700
770
|
def split_data(force: false)
|
701
|
-
return unless force ||
|
771
|
+
return unless force || needs_refresh?
|
702
772
|
|
703
773
|
cleanup
|
704
|
-
features = self.features.ordered.load
|
705
774
|
splitter.split(datasource) do |train_df, valid_df, test_df|
|
706
775
|
[:train, :valid, :test].zip([train_df, valid_df, test_df]).each do |segment, df|
|
707
776
|
raw.save(segment, df)
|
@@ -709,10 +778,6 @@ module EasyML
|
|
709
778
|
end
|
710
779
|
end
|
711
780
|
|
712
|
-
def should_split?
|
713
|
-
needs_refresh?
|
714
|
-
end
|
715
|
-
|
716
781
|
def filter_duplicate_features
|
717
782
|
return unless attributes["features_attributes"].present?
|
718
783
|
|
@@ -733,6 +798,7 @@ module EasyML
|
|
733
798
|
end
|
734
799
|
|
735
800
|
def apply_features(df, features = self.features)
|
801
|
+
features = features.ready_to_apply
|
736
802
|
if features.nil? || features.empty?
|
737
803
|
df
|
738
804
|
else
|
@@ -754,10 +820,6 @@ module EasyML
|
|
754
820
|
|
755
821
|
result = feature.transform_batch(acc_df)
|
756
822
|
|
757
|
-
unless result.is_a?(Polars::DataFrame)
|
758
|
-
raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
|
759
|
-
end
|
760
|
-
|
761
823
|
result
|
762
824
|
end
|
763
825
|
end
|
@@ -769,16 +831,6 @@ module EasyML
|
|
769
831
|
end).to_h.compact.reject { |_k, v| v["method"] == "none" }
|
770
832
|
end
|
771
833
|
|
772
|
-
def initialize_preprocessor
|
773
|
-
EasyML::Data::Preprocessor.new(
|
774
|
-
directory: Pathname.new(root_dir).append("preprocessor"),
|
775
|
-
preprocessing_steps: preprocessing_steps,
|
776
|
-
dataset: self,
|
777
|
-
).tap do |preprocessor|
|
778
|
-
preprocessor.statistics = preprocessor_statistics
|
779
|
-
end
|
780
|
-
end
|
781
|
-
|
782
834
|
def fully_reload
|
783
835
|
return unless persisted?
|
784
836
|
|
@@ -2,28 +2,28 @@
|
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_dataset_histories
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# dataset_id
|
7
|
-
# name
|
8
|
-
# description
|
9
|
-
# dataset_type
|
10
|
-
# status
|
11
|
-
# version
|
12
|
-
# datasource_id
|
13
|
-
# root_dir
|
14
|
-
# configuration
|
15
|
-
# num_rows
|
16
|
-
# workflow_status
|
17
|
-
# statistics
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :integer not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# dataset_type :string
|
10
|
+
# status :string
|
11
|
+
# version :string
|
12
|
+
# datasource_id :integer
|
13
|
+
# root_dir :string
|
14
|
+
# configuration :json
|
15
|
+
# num_rows :integer
|
16
|
+
# workflow_status :string
|
17
|
+
# statistics :json
|
18
|
+
# schema :json
|
19
|
+
# refreshed_at :datetime
|
20
|
+
# created_at :datetime not null
|
21
|
+
# updated_at :datetime not null
|
22
|
+
# history_started_at :datetime not null
|
23
|
+
# history_ended_at :datetime
|
24
|
+
# history_user_id :integer
|
25
|
+
# snapshot_id :string
|
26
|
+
# last_datasource_sha :string
|
27
27
|
#
|
28
28
|
module EasyML
|
29
29
|
class DatasetHistory < ActiveRecord::Base
|
@@ -44,7 +44,7 @@ module EasyML
|
|
44
44
|
true
|
45
45
|
end
|
46
46
|
|
47
|
-
def
|
47
|
+
def needs_refresh?
|
48
48
|
false
|
49
49
|
end
|
50
50
|
end
|
@@ -10,6 +10,7 @@
|
|
10
10
|
# refreshed_at :datetime
|
11
11
|
# created_at :datetime not null
|
12
12
|
# updated_at :datetime not null
|
13
|
+
# sha :string
|
13
14
|
#
|
14
15
|
module EasyML
|
15
16
|
class Datasource < ActiveRecord::Base
|
@@ -55,6 +56,7 @@ module EasyML
|
|
55
56
|
|
56
57
|
has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
|
57
58
|
attr_accessor :schema, :columns, :num_rows, :is_syncing
|
59
|
+
belongs_to :dataset, class_name: "EasyML::Dataset", optional: true, dependent: :destroy
|
58
60
|
|
59
61
|
add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
|
60
62
|
DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
|
@@ -118,11 +120,13 @@ module EasyML
|
|
118
120
|
self.num_rows = data.shape[0]
|
119
121
|
self.is_syncing = false
|
120
122
|
self.refreshed_at = Time.now
|
123
|
+
self.sha = adapter.sha
|
121
124
|
save
|
122
125
|
end
|
123
126
|
|
124
127
|
def refresh
|
125
128
|
unless adapter.needs_refresh?
|
129
|
+
update(sha: adapter.sha) if sha.nil?
|
126
130
|
update!(is_syncing: false)
|
127
131
|
return
|
128
132
|
end
|
@@ -6,18 +6,8 @@ module EasyML
|
|
6
6
|
validates :df, presence: true
|
7
7
|
add_configuration_attributes :df
|
8
8
|
|
9
|
-
def query(
|
10
|
-
|
11
|
-
|
12
|
-
df = self.df.clone
|
13
|
-
df = df.filter(filter) if filter
|
14
|
-
df = df.select(select) if select.present?
|
15
|
-
df = df.unique if unique
|
16
|
-
drop_cols &= df.columns
|
17
|
-
df = df.drop(drop_cols) unless drop_cols.empty?
|
18
|
-
df = df.sort(sort, reverse: descending) if sort
|
19
|
-
df = df.limit(limit) if limit
|
20
|
-
df
|
9
|
+
def query(**kwargs)
|
10
|
+
EasyML::Data::PolarsInMemory.query(df, **kwargs)
|
21
11
|
end
|
22
12
|
|
23
13
|
def in_batches(of: 10_000)
|
@@ -40,6 +30,10 @@ module EasyML
|
|
40
30
|
datasource.updated_at
|
41
31
|
end
|
42
32
|
|
33
|
+
def sha
|
34
|
+
nil
|
35
|
+
end
|
36
|
+
|
43
37
|
def data
|
44
38
|
df
|
45
39
|
end
|
@@ -17,7 +17,7 @@ module EasyML
|
|
17
17
|
add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for
|
18
18
|
|
19
19
|
delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
|
20
|
-
to: :synced_directory
|
20
|
+
:sha, to: :synced_directory
|
21
21
|
|
22
22
|
def in_batches(&block)
|
23
23
|
synced_directory.in_batches(&block)
|