easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/engine.rb +1 -0
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
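The biggest change in this release is the removal of EasyML::Data::Preprocessor, SimpleImputer, and StatisticsLearner in favor of per-column imputer and learner classes under EasyML::Column (see the added files above). Based on the calls that appear in the dataset diff below, the column list now drives both statistics learning and transformation; a rough sketch of the new call flow (method names are taken from the diff, the dataset lookup is illustrative):

```ruby
# Sketch of the column-driven preprocessing flow in rc61 (illustrative only).
dataset = EasyML::Dataset.find_by(name: "My Dataset")

# Learn raw statistics for every column (replaces StatisticsLearner).
dataset.columns.learn(type: :raw)

# Apply each column's configured imputers to a dataframe
# (replaces Preprocessor#postprocess); computed: true runs a second pass
# over feature-generated columns.
df = dataset.raw.data
df = dataset.columns.transform(df, inference: false)

# The list of available strategies now comes from the imputer registry.
EasyML::Column::Imputers.constants[:preprocessing_strategies]
```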
data/app/models/easy_ml/dataset.rb:

@@ -1,24 +1,24 @@
-# ==
+# == Schema Information
 #
 # Table name: easy_ml_datasets
 #
-# id
-# name
-# description
-# dataset_type
-# status
-# version
-# datasource_id
-# root_dir
-# configuration
-# num_rows
-# workflow_status
-# statistics
-#
-#
-#
-#
-#
+# id                  :bigint           not null, primary key
+# name                :string           not null
+# description         :string
+# dataset_type        :string
+# status              :string
+# version             :string
+# datasource_id       :bigint
+# root_dir            :string
+# configuration       :json
+# num_rows            :bigint
+# workflow_status     :string
+# statistics          :json
+# schema              :json
+# refreshed_at        :datetime
+# created_at          :datetime         not null
+# updated_at          :datetime         not null
+# last_datasource_sha :string
 #
 module EasyML
   class Dataset < ActiveRecord::Base
@@ -45,7 +45,7 @@ module EasyML
     has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
     accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true

-    has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
+    has_many :features, dependent: :destroy, class_name: "EasyML::Feature", extend: EasyML::FeatureList
     accepts_nested_attributes_for :features, allow_destroy: true

     has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
@@ -80,7 +80,7 @@ module EasyML
        column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
          { value: type.to_s, label: type.to_s.titleize }
        end,
-       preprocessing_strategies: EasyML::
+       preprocessing_strategies: EasyML::Column::Imputers.constants[:preprocessing_strategies],
        feature_options: EasyML::Features::Registry.list_flat,
        splitter_constants: EasyML::Splitter.constants,
      }
@@ -119,13 +119,6 @@ module EasyML
      processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
    end

-    def refresh_datatypes
-      return unless columns_need_refresh?
-
-      cleanup
-      datasource.reread(columns)
-    end
-
    def num_rows
      if datasource&.num_rows.nil?
        datasource.after_sync
@@ -142,7 +135,7 @@ module EasyML

    def best_segment
      [processed, raw].detect do |segment|
-        segment.send(:
+        segment.send(:data, all_columns: true, limit: 1)&.columns
      end
    end

@@ -168,15 +161,27 @@ module EasyML
      save
    end

+    def refreshed_datasource?
+      last_datasource_sha_changed?
+    end
+
+    def prepare_features
+      features.update_all(workflow_status: "ready")
+    end
+
    def prepare!
+      prepare_features
      cleanup
      refresh_datasource!
      split_data
+      process_data
    end

    def prepare
+      prepare_features
      refresh_datasource
      split_data
+      process_data
    end

    def actually_refresh
@@ -184,7 +189,8 @@ module EasyML
      learn(delete: false) # After syncing datasource, learn new statistics + sync columns
      process_data
      fully_reload
-      learn
+      learn
+      learn_statistics(type: :processed) # After processing data, we learn any new statistics
      now = UTC.now
      update(workflow_status: "ready", refreshed_at: now, updated_at: now)
      fully_reload
@@ -252,19 +258,57 @@ module EasyML
      features_need_fit.any?
    end

-
+    # Some of these are expensive to calculate, so we only want to include
+    # them in the refresh reasons if they are actually needed.
+    #
+    # During dataset_serializer for instance, we don't want to check s3,
+    # we only do that during background jobs.
+    #
+    # So yes this is an annoying way to structure a method, but it's helpful for performance
+    #
+    def refresh_reasons(exclude: [])
      {
-
-
-
-
-
-
-
+        not_split: {
+          name: "Not split",
+          check: -> { not_split? },
+        },
+        refreshed_at_is_nil: {
+          name: "Refreshed at is nil",
+          check: -> { refreshed_at.nil? },
+        },
+        columns_need_refresh: {
+          name: "Columns need refresh",
+          check: -> { columns_need_refresh? },
+        },
+        features_need_fit: {
+          name: "Features need refresh",
+          check: -> { features_need_fit? },
+        },
+        datasource_needs_refresh: {
+          name: "Datasource needs refresh",
+          check: -> { datasource_needs_refresh? },
+        },
+        refreshed_datasource: {
+          name: "Refreshed datasource",
+          check: -> { refreshed_datasource? },
+        },
+        datasource_was_refreshed: {
+          name: "Datasource was refreshed",
+          check: -> { datasource_was_refreshed? },
+        },
+      }.except(*exclude).select do |k, config|
+        config[:check].call
+      end.map do |k, config|
+        config[:name]
+      end
+    end
+
+    def needs_refresh?(exclude: [])
+      refresh_reasons(exclude: exclude).any?
    end

-    def
-
+    def processed?
+      !needs_refresh?
    end

    def not_split?
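For readers skimming the diff: refresh_reasons returns the human-readable names of whichever checks fail, and needs_refresh? is simply "any reasons left after exclusions?". A small usage sketch (the serializer scenario comes from the comment above; the exclude key mirrors the hash keys, and the output shown is only an example):

```ruby
# Cheap check for web requests: skip the reason that would query S3.
dataset.refresh_reasons(exclude: [:datasource_needs_refresh])
# => ["Not split", "Columns need refresh"]   (example output)

dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
# => true whenever any reason remains after the exclusions
```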
@@ -281,7 +325,6 @@ module EasyML

    def learn(delete: true)
      learn_schema
-      learn_statistics
      columns.sync(delete: delete)
    end

@@ -333,6 +376,8 @@ module EasyML

    def learn_schema
      data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
+      return nil if data.nil?
+
      schema = data.schema.reduce({}) do |h, (k, v)|
        h.tap do
          h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
@@ -341,19 +386,15 @@ module EasyML
      write_attribute(:schema, schema)
    end

-    def learn_statistics
-
-
-
-
-
-      columns.select(&:is_computed).each do |col|
-        if stats.dig(:processed, col.name)
-          stats[:raw][col.name] = stats[:processed][col.name]
-        end
-      end
+    def learn_statistics(type: :raw, computed: false)
+      columns.learn(type: type, computed: computed)
+      update(
+        statistics: columns.reload.statistics,
+      )
+    end

-
+    def statistics
+      (read_attribute(:statistics) || {}).with_indifferent_access
    end

    def process_data
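learn_statistics now takes a type and a computed flag rather than computing everything at once. The call sequence used elsewhere in this diff (fit, actually_refresh, and learn_computed_columns) looks roughly like the sketch below; the raw/processed grouping of the persisted hash is an assumption carried over from the old code:

```ruby
# Raw statistics, learned from the unprocessed splits (what #fit now does).
dataset.learn_statistics(type: :raw)

# After features are applied, statistics for computed columns are learned
# from the processed data (the call made by learn_computed_columns).
dataset.learn_statistics(type: :processed, computed: true)

# Statistics persist on the record and read back with indifferent access.
dataset.statistics["raw"]   # assumed shape, mirroring the old stats[:raw]
```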
@@ -410,10 +451,9 @@ module EasyML
    def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
      df = apply_missing_features(df, inference: inference)
      df = drop_nulls(df)
-      df =
+      df = columns.transform(df, inference: inference)
      df = apply_features(df, features)
-
-      df = preprocessor.postprocess(df, inference: inference, computed: true)
+      df = columns.transform(df, inference: inference, computed: true)
      df = apply_column_mask(df, inference: inference) unless all_columns
      df, = processed.split_features_targets(df, true, target) if split_ys
      df
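Note the two-pass shape of normalize: columns.transform runs once before features for raw columns and once after, with computed: true, for feature-generated columns. Condensed, the same idea reads as follows (a sketch as it would appear inside EasyML::Dataset, not the real method body):

```ruby
# Condensed restatement of the pipeline above.
def normalize_sketch(df, inference: false)
  df = columns.transform(df, inference: inference)                  # raw columns first
  df = apply_features(df, features)                                 # then feature engineering
  columns.transform(df, inference: inference, computed: true)       # then computed columns
end
```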
@@ -494,16 +534,15 @@ module EasyML
      result.empty? ? nil : result
    end

-    def processed?
-      !should_split?
-    end
-
    def decode_labels(ys, col: nil)
-
+      if col.nil?
+        col = target
+      end
+      preloaded_columns.find_by(name: col).decode_labels(ys)
    end

    def preprocessing_steps
-      return {} if
+      return {} if preloaded_columns.nil? || (preloaded_columns.respond_to?(:empty?) && preloaded_columns.empty?)
      return @preprocessing_steps if @preprocessing_steps.present?

      training = standardize_preprocessing_steps(:training)
@@ -515,19 +554,12 @@ module EasyML
      }.compact.deep_symbolize_keys
    end

-    def preprocessor
-      @preprocessor ||= initialize_preprocessor
-      return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
-
-      @preprocessor = initialize_preprocessor
-    end
-
    def target
      @target ||= preloaded_columns.find(&:is_target)&.name
    end

    def date_column
-      @date_column ||=
+      @date_column ||= preloaded_columns.find(&:is_date_column?)
    end

    def drop_cols
@@ -596,7 +628,7 @@ module EasyML
    end

    def upload_remote_files
-      return
+      return if !needs_refresh?

      processed.upload.tap do
        features.each(&:upload_remote_files)
@@ -668,13 +700,16 @@ module EasyML

    def refresh_datasource
      datasource.reload.refresh
-
-      initialize_splits
+      after_refresh_datasource
    end

    def refresh_datasource!
      datasource.reload.refresh!
-
+      after_refresh_datasource
+    end
+
+    def after_refresh_datasource
+      update(last_datasource_sha: datasource.sha)
      initialize_splits
    end

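The new last_datasource_sha column (added by the add_sha_to_datasources_datasets_and_columns migration listed above) is what backs refreshed_datasource?: every datasource refresh records a content SHA, and after_refresh_datasource copies it onto the dataset so a later pass can tell whether the upstream data actually changed. A sketch of the flow; the digest format is an assumption:

```ruby
# Datasource side: refresh records a content SHA (the S3/file adapters expose
# #sha via their synced directory; the in-memory Polars datasource returns nil).
datasource.refresh
datasource.sha                # => e.g. "3f2a9c..." (assumed hex digest)

# Dataset side: after_refresh_datasource stores that SHA, and the
# "Refreshed datasource" refresh reason fires when it changes.
dataset.refresh_datasource
dataset.last_datasource_sha   # mirrors datasource.sha after the refresh
```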
@@ -683,12 +718,24 @@ module EasyML

      SPLIT_ORDER.each do |segment|
        df = raw.read(segment)
+        learn_computed_columns(df) if segment == :train
        processed_df = normalize(df, all_columns: true)
        processed.save(segment, processed_df)
      end
      @normalized = true
    end

+    def learn_computed_columns(df)
+      return unless features.ready_to_apply.any?
+
+      df = df.clone
+      df = apply_features(df)
+      processed.save(:train, df)
+      learn(delete: false)
+      learn_statistics(type: :processed, computed: true)
+      processed.cleanup
+    end
+
    def drop_nulls(df)
      return df if drop_if_null.nil? || drop_if_null.empty?

@@ -698,8 +745,12 @@ module EasyML
      df.drop_nulls(subset: drop)
    end

+    # Pass refresh: false for frontend views so we don't query S3 during web requests
    def load_data(segment, **kwargs, &block)
-
+      needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
+      kwargs.delete(:refresh)
+
+      if !needs_refresh
        processed.load_data(segment, **kwargs, &block)
      else
        raw.load_data(segment, **kwargs, &block)
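The new refresh keyword lets callers such as the frontend dataset preview read data without triggering the freshness checks (which can hit S3). A usage sketch; passing limit through kwargs is an assumption based on the other data(limit:) calls in this diff:

```ruby
# Background jobs: default behaviour; falls back to the raw split while the
# dataset still needs a refresh.
dataset.load_data(:train)

# Web requests: skip the freshness check entirely and read the processed
# split that is already on disk.
dataset.load_data(:train, refresh: false, limit: 100)
```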
@@ -707,9 +758,7 @@ module EasyML
    end

    def fit
-
-      preprocessor.fit(raw.train(all_columns: true), computed_statistics)
-      update(preprocessor_statistics: preprocessor.statistics)
+      learn_statistics(type: :raw)
    end

    # log_method :fit, "Learning statistics", verbose: true
@@ -719,7 +768,7 @@ module EasyML
    end

    def split_data(force: false)
-      return unless force ||
+      return unless force || needs_refresh?

      cleanup
      splitter.split(datasource) do |train_df, valid_df, test_df|
@@ -729,10 +778,6 @@ module EasyML
      end
    end

-    def should_split?
-      needs_refresh?
-    end
-
    def filter_duplicate_features
      return unless attributes["features_attributes"].present?

@@ -753,6 +798,7 @@ module EasyML
    end

    def apply_features(df, features = self.features)
+      features = features.ready_to_apply
      if features.nil? || features.empty?
        df
      else
@@ -774,10 +820,6 @@ module EasyML

        result = feature.transform_batch(acc_df)

-        unless result.is_a?(Polars::DataFrame)
-          raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
-        end
-
        result
      end
    end
@@ -789,16 +831,6 @@ module EasyML
      end).to_h.compact.reject { |_k, v| v["method"] == "none" }
    end

-    def initialize_preprocessor
-      EasyML::Data::Preprocessor.new(
-        directory: Pathname.new(root_dir).append("preprocessor"),
-        preprocessing_steps: preprocessing_steps,
-        dataset: self,
-      ).tap do |preprocessor|
-        preprocessor.statistics = preprocessor_statistics
-      end
-    end
-
    def fully_reload
      return unless persisted?

data/app/models/easy_ml/dataset_history.rb:

@@ -2,28 +2,28 @@
 #
 # Table name: easy_ml_dataset_histories
 #
-# id
-# dataset_id
-# name
-# description
-# dataset_type
-# status
-# version
-# datasource_id
-# root_dir
-# configuration
-# num_rows
-# workflow_status
-# statistics
-#
-#
-#
-#
-#
-#
-#
-#
-#
+# id                  :bigint           not null, primary key
+# dataset_id          :integer          not null
+# name                :string           not null
+# description         :string
+# dataset_type        :string
+# status              :string
+# version             :string
+# datasource_id       :integer
+# root_dir            :string
+# configuration       :json
+# num_rows            :integer
+# workflow_status     :string
+# statistics          :json
+# schema              :json
+# refreshed_at        :datetime
+# created_at          :datetime         not null
+# updated_at          :datetime         not null
+# history_started_at  :datetime         not null
+# history_ended_at    :datetime
+# history_user_id     :integer
+# snapshot_id         :string
+# last_datasource_sha :string
 #
 module EasyML
   class DatasetHistory < ActiveRecord::Base
@@ -44,7 +44,7 @@ module EasyML
      true
    end

-    def
+    def needs_refresh?
      false
    end
  end

data/app/models/easy_ml/datasource.rb:

@@ -10,6 +10,7 @@
 # refreshed_at :datetime
 # created_at   :datetime         not null
 # updated_at   :datetime         not null
+# sha          :string
 #
 module EasyML
   class Datasource < ActiveRecord::Base
@@ -119,11 +120,13 @@ module EasyML
      self.num_rows = data.shape[0]
      self.is_syncing = false
      self.refreshed_at = Time.now
+      self.sha = adapter.sha
      save
    end

    def refresh
      unless adapter.needs_refresh?
+        update(sha: adapter.sha) if sha.nil?
        update!(is_syncing: false)
        return
      end

data/app/models/easy_ml/datasources/polars_datasource.rb:

@@ -6,18 +6,8 @@ module EasyML
    validates :df, presence: true
    add_configuration_attributes :df

-    def query(
-
-
-      df = self.df.clone
-      df = df.filter(filter) if filter
-      df = df.select(select) if select.present?
-      df = df.unique if unique
-      drop_cols &= df.columns
-      df = df.drop(drop_cols) unless drop_cols.empty?
-      df = df.sort(sort, reverse: descending) if sort
-      df = df.limit(limit) if limit
-      df
+    def query(**kwargs)
+      EasyML::Data::PolarsInMemory.query(df, **kwargs)
    end

    def in_batches(of: 10_000)
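The hand-rolled query body removed above now lives in the new EasyML::Data::PolarsInMemory helper (data/lib/easy_ml/data/polars_in_memory.rb in the file list). Assuming it accepts the same options the old inline implementation handled, a call would look like:

```ruby
# Option names mirror the removed implementation (filter, select, unique,
# drop_cols, sort, descending, limit); treat the interface as an assumption.
EasyML::Data::PolarsInMemory.query(
  df,
  filter: Polars.col("age") > 30,
  select: ["name", "age"],
  sort: "age",
  descending: true,
  limit: 10,
)
```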
@@ -40,6 +30,10 @@ module EasyML
      datasource.updated_at
    end

+    def sha
+      nil
+    end
+
    def data
      df
    end

data/app/models/easy_ml/datasources/s3_datasource.rb:

@@ -17,7 +17,7 @@ module EasyML
    add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for

    delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
-             to: :synced_directory
+             :sha, to: :synced_directory

    def in_batches(&block)
      synced_directory.in_batches(&block)

data/app/models/easy_ml/feature.rb:

@@ -55,6 +55,7 @@ module EasyML
    end

    belongs_to :dataset, class_name: "EasyML::Dataset"
+    has_many :columns, class_name: "EasyML::Column", dependent: :destroy

    validates :feature_class, presence: true
    validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -72,7 +73,7 @@ module EasyML
    end

    # Combine all conditions with OR
-    where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).
+    where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).map(&:id))
  }
  scope :never_applied, -> { where(applied_at: nil) }
  scope :never_fit, -> do
@@ -81,6 +82,7 @@ module EasyML
    where(id: fittable.map(&:id))
  end
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
+  scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }

  before_save :apply_defaults, if: :new_record?
  before_save :update_sha
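ready_to_apply is the complement of needs_fit, and Dataset#apply_features (earlier in this diff) now narrows to that set before running any feature adapters, so unfitted features are skipped instead of being applied with stale state. In use:

```ruby
dataset.features.needs_fit        # changed, never-applied, or never-fit features
dataset.features.ready_to_apply   # everything else; the only features transform will run

# Inside Dataset#apply_features (shown above):
#   features = features.ready_to_apply
```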
@@ -223,8 +225,11 @@ module EasyML
    def fit(features: [self], async: false)
      ordered_features = features.sort_by(&:feature_position)
      jobs = ordered_features.map(&:build_batches)
+      job_count = jobs.dup.flatten.size

-
+      # This is very important! For whatever reason, Resque BatchJob does not properly
+      # handle batch finished callbacks for batch size = 1
+      if async && job_count > 1
        EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
      else
        jobs.flatten.each do |job|
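The job_count guard above exists because, per the inline comment, Resque's batch-finished callbacks misbehave for single-job batches; with only one batch, a fit runs in-process even when async is requested. Observable behaviour, assuming a feature whose build_batches yields a single batch:

```ruby
feature.fit(async: true)
# With one batch: runs synchronously via the `else` branch above.
# With several batches: enqueued through
# EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs).
```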
@@ -240,7 +245,8 @@ module EasyML
      if batch_args.key?(:batch_start)
        actually_fit_batch(batch_args)
      else
-
+        batch_args = get_batch_args(**batch_args)
+        actually_fit_batch(batch_args)
      end
    end

@@ -288,12 +294,14 @@ module EasyML
      batch_args.symbolize_keys!

      if adapter.respond_to?(:batch)
-
+        df = dataset.raw
      else
        df = build_batch(batch_args)
-        batch_df = adapter.fit(df, self, batch_args)
      end
      end
+      return if df.blank?
+
+      batch_df = adapter.fit(df, self, batch_args)
      if batch_df.present?
        store(batch_df)
      else
@@ -306,7 +314,11 @@ module EasyML
      return nil unless df.is_a?(Polars::DataFrame)
      return df if !adapter.respond_to?(:transform) && feature_store.empty?

+      df_len_was = df.shape[0]
      result = adapter.transform(df, self)
+      raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
+      df_len_now = result.shape[0]
+      raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
      update!(applied_at: Time.current)
      result
    end
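transform_batch now enforces the adapter contract directly: transform must return a Polars::DataFrame with exactly as many rows as it received (the DataFrame check that previously lived in Dataset#apply_features, plus a new row-count assertion). A custom feature adapter therefore needs to preserve row count; the class and column names below are made up:

```ruby
# Hypothetical feature adapter satisfying the contract checked above.
class PriceDoubler
  # Receives the batch dataframe and the EasyML::Feature record; must return
  # a Polars::DataFrame with the same number of rows it was given.
  def transform(df, feature)
    df.with_columns((Polars.col("price") * 2).alias("price_doubled"))
  end
end
```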
@@ -384,8 +396,8 @@ module EasyML
      feature_store.list_partitions
    end

-    def query(
-      feature_store.query(
+    def query(**kwargs)
+      feature_store.query(**kwargs)
    end

    def store(df)

data/app/models/easy_ml/feature_history.rb:

@@ -31,6 +31,18 @@ module EasyML

    after_find :download_remote_files
    scope :ordered, -> { order(feature_position: :asc) }
+    scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
+    scope :has_changes, lambda {
+      none
+    }
+    scope :never_applied, -> { where(applied_at: nil) }
+    scope :never_fit, -> do
+      fittable = where(fit_at: nil)
+      fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
+      where(id: fittable.map(&:id))
+    end
+    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
+    scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }

    def download_remote_files
      feature_store&.download
|