easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +33 -0
- data/app/controllers/easy_ml/datasources_controller.rb +7 -0
- data/app/controllers/easy_ml/models_controller.rb +38 -0
- data/app/frontend/components/DatasetCard.tsx +212 -0
- data/app/frontend/components/ModelCard.tsx +69 -29
- data/app/frontend/components/StackTrace.tsx +13 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
- data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
- data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
- data/app/frontend/components/models/UploadModelModal.tsx +212 -0
- data/app/frontend/components/models/index.ts +2 -0
- data/app/frontend/pages/DatasetsPage.tsx +36 -130
- data/app/frontend/pages/DatasourcesPage.tsx +22 -2
- data/app/frontend/pages/ModelsPage.tsx +37 -11
- data/app/frontend/types/dataset.ts +1 -2
- data/app/frontend/types.ts +1 -1
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +4 -0
- data/app/models/easy_ml/column/imputers/clip.rb +5 -3
- data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
- data/app/models/easy_ml/column/imputers/mean.rb +7 -3
- data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
- data/app/models/easy_ml/column/imputers.rb +3 -1
- data/app/models/easy_ml/column/lineage/base.rb +5 -1
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
- data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +4 -0
- data/app/models/easy_ml/column.rb +79 -63
- data/app/models/easy_ml/column_history.rb +28 -28
- data/app/models/easy_ml/column_list/imputer.rb +23 -0
- data/app/models/easy_ml/column_list.rb +39 -26
- data/app/models/easy_ml/dataset/learner/base.rb +34 -0
- data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
- data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
- data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
- data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
- data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
- data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
- data/app/models/easy_ml/dataset/learner/query.rb +25 -0
- data/app/models/easy_ml/dataset/learner.rb +100 -0
- data/app/models/easy_ml/dataset.rb +150 -36
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/datasource.rb +9 -0
- data/app/models/easy_ml/event.rb +4 -0
- data/app/models/easy_ml/export/column.rb +27 -0
- data/app/models/easy_ml/export/dataset.rb +37 -0
- data/app/models/easy_ml/export/datasource.rb +12 -0
- data/app/models/easy_ml/export/feature.rb +24 -0
- data/app/models/easy_ml/export/model.rb +40 -0
- data/app/models/easy_ml/export/retraining_job.rb +20 -0
- data/app/models/easy_ml/export/splitter.rb +14 -0
- data/app/models/easy_ml/feature.rb +21 -0
- data/app/models/easy_ml/import/column.rb +35 -0
- data/app/models/easy_ml/import/dataset.rb +148 -0
- data/app/models/easy_ml/import/feature.rb +36 -0
- data/app/models/easy_ml/import/model.rb +136 -0
- data/app/models/easy_ml/import/retraining_job.rb +29 -0
- data/app/models/easy_ml/import/splitter.rb +34 -0
- data/app/models/easy_ml/lineage.rb +44 -0
- data/app/models/easy_ml/model.rb +93 -36
- data/app/models/easy_ml/model_file.rb +6 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
- data/app/models/easy_ml/models/xgboost.rb +33 -9
- data/app/models/easy_ml/retraining_job.rb +8 -1
- data/app/models/easy_ml/retraining_run.rb +6 -4
- data/app/models/easy_ml/splitter.rb +8 -0
- data/app/models/lineage_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +7 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
- data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
- data/config/routes.rb +13 -1
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
- data/lib/easy_ml/core/tuner.rb +12 -11
- data/lib/easy_ml/data/polars_column.rb +149 -100
- data/lib/easy_ml/data/polars_reader.rb +8 -5
- data/lib/easy_ml/data/polars_schema.rb +56 -0
- data/lib/easy_ml/data/splits/file_split.rb +20 -2
- data/lib/easy_ml/data/splits/split.rb +10 -1
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/deep_compact.rb +19 -0
- data/lib/easy_ml/feature_store.rb +2 -6
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
- data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
- data/lib/easy_ml/timing.rb +34 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +2 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
- metadata +52 -12
- data/app/models/easy_ml/column/learners/base.rb +0 -103
- data/app/models/easy_ml/column/learners/boolean.rb +0 -11
- data/app/models/easy_ml/column/learners/categorical.rb +0 -51
- data/app/models/easy_ml/column/learners/datetime.rb +0 -19
- data/app/models/easy_ml/column/learners/null.rb +0 -22
- data/app/models/easy_ml/column/learners/numeric.rb +0 -33
- data/app/models/easy_ml/column/learners/string.rb +0 -15
- data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -19,6 +19,7 @@
|
|
19
19
|
# created_at :datetime not null
|
20
20
|
# updated_at :datetime not null
|
21
21
|
# last_datasource_sha :string
|
22
|
+
# raw_schema :jsonb
|
22
23
|
#
|
23
24
|
module EasyML
|
24
25
|
class Dataset < ActiveRecord::Base
|
@@ -86,6 +87,26 @@ module EasyML
|
|
86
87
|
}
|
87
88
|
end
|
88
89
|
|
90
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
91
|
+
id
|
92
|
+
statistics
|
93
|
+
root_dir
|
94
|
+
created_at
|
95
|
+
updated_at
|
96
|
+
refreshed_at
|
97
|
+
sha
|
98
|
+
datasource_id
|
99
|
+
last_datasource_sha
|
100
|
+
)
|
101
|
+
|
102
|
+
def to_config
|
103
|
+
EasyML::Export::Dataset.to_config(self)
|
104
|
+
end
|
105
|
+
|
106
|
+
def self.from_config(json_config, action: nil, dataset: nil)
|
107
|
+
EasyML::Import::Dataset.from_config(json_config, action: action, dataset: dataset)
|
108
|
+
end
|
109
|
+
|
89
110
|
def root_dir=(value)
|
90
111
|
raise "Cannot override value of root_dir!" unless value.to_s == root_dir.to_s
|
91
112
|
|
@@ -111,12 +132,41 @@ module EasyML
|
|
111
132
|
FileUtils.rm_rf(root_dir) if root_dir.present?
|
112
133
|
end
|
113
134
|
|
135
|
+
def as_json
|
136
|
+
@serializing = true
|
137
|
+
super.tap do
|
138
|
+
@serializing = false
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
114
142
|
def schema
|
115
|
-
|
143
|
+
return @schema if @schema
|
144
|
+
return read_attribute(:schema) if @serializing
|
145
|
+
|
146
|
+
schema = read_attribute(:schema) || datasource.schema || datasource.after_sync.schema
|
147
|
+
schema = set_schema(schema)
|
148
|
+
@schema = EasyML::Data::PolarsSchema.deserialize(schema)
|
149
|
+
end
|
150
|
+
|
151
|
+
def raw_schema
|
152
|
+
return @raw_schema if @raw_schema
|
153
|
+
return read_attribute(:raw_schema) if @serializing
|
154
|
+
|
155
|
+
raw_schema = read_attribute(:raw_schema) || datasource.schema || datasource.after_sync.schema
|
156
|
+
raw_schema = set_raw_schema(raw_schema)
|
157
|
+
@raw_schema = EasyML::Data::PolarsSchema.deserialize(raw_schema)
|
158
|
+
end
|
159
|
+
|
160
|
+
def set_schema(schema)
|
161
|
+
write_attribute(:schema, EasyML::Data::PolarsSchema.serialize(schema))
|
162
|
+
end
|
163
|
+
|
164
|
+
def set_raw_schema(raw_schema)
|
165
|
+
write_attribute(:raw_schema, EasyML::Data::PolarsSchema.serialize(raw_schema))
|
116
166
|
end
|
117
167
|
|
118
168
|
def processed_schema
|
119
|
-
processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
|
169
|
+
processed.data(limit: 1, lazy: true)&.schema || raw.data(limit: 1)&.schema
|
120
170
|
end
|
121
171
|
|
122
172
|
def num_rows
|
@@ -126,6 +176,12 @@ module EasyML
|
|
126
176
|
datasource&.num_rows
|
127
177
|
end
|
128
178
|
|
179
|
+
def abort!
|
180
|
+
EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
|
181
|
+
update(workflow_status: :ready)
|
182
|
+
unlock!
|
183
|
+
end
|
184
|
+
|
129
185
|
def refresh_async
|
130
186
|
return if analyzing?
|
131
187
|
|
@@ -145,6 +201,12 @@ module EasyML
|
|
145
201
|
@raw = initialize_split("raw")
|
146
202
|
end
|
147
203
|
|
204
|
+
def clipped
|
205
|
+
return @clipped if @clipped && @clipped.dataset
|
206
|
+
|
207
|
+
@clipped = initialize_split("clipped")
|
208
|
+
end
|
209
|
+
|
148
210
|
def processed
|
149
211
|
return @processed if @processed && @processed.dataset
|
150
212
|
|
@@ -186,23 +248,26 @@ module EasyML
|
|
186
248
|
|
187
249
|
def actually_refresh
|
188
250
|
refreshing do
|
189
|
-
puts "actually_refresh"
|
190
251
|
learn(delete: false) # After syncing datasource, learn new statistics + sync columns
|
191
252
|
process_data
|
192
|
-
puts "process_data"
|
193
253
|
fully_reload
|
194
|
-
puts "Learning..."
|
195
254
|
learn
|
196
255
|
learn_statistics(type: :processed) # After processing data, we learn any new statistics
|
256
|
+
fully_reload
|
197
257
|
now = UTC.now
|
198
258
|
update(workflow_status: "ready", refreshed_at: now, updated_at: now)
|
199
259
|
fully_reload
|
200
260
|
end
|
201
261
|
end
|
202
262
|
|
263
|
+
include EasyML::Timing
|
264
|
+
measure_method_timing :actually_refresh
|
265
|
+
|
203
266
|
def refresh!(async: false)
|
204
267
|
refreshing do
|
268
|
+
puts "Prepare..."
|
205
269
|
prepare!
|
270
|
+
puts "Fit features..."
|
206
271
|
fit_features!(async: async)
|
207
272
|
end
|
208
273
|
end
|
@@ -218,6 +283,8 @@ module EasyML
|
|
218
283
|
end
|
219
284
|
end
|
220
285
|
|
286
|
+
measure_method_timing :refresh
|
287
|
+
|
221
288
|
def fit_features!(async: false, features: self.features)
|
222
289
|
fit_features(async: async, features: features, force: true)
|
223
290
|
end
|
@@ -229,6 +296,8 @@ module EasyML
|
|
229
296
|
features.first.fit(features: features_to_compute, async: async)
|
230
297
|
end
|
231
298
|
|
299
|
+
measure_method_timing :fit_features
|
300
|
+
|
232
301
|
def after_fit_features
|
233
302
|
puts "after fit features..."
|
234
303
|
unlock!
|
@@ -378,15 +447,11 @@ module EasyML
|
|
378
447
|
end
|
379
448
|
|
380
449
|
def learn_schema
|
381
|
-
|
382
|
-
return nil if
|
450
|
+
split = processed.data(limit: 1).to_a.any? ? :processed : :raw
|
451
|
+
return nil if split.nil?
|
383
452
|
|
384
|
-
schema = data
|
385
|
-
|
386
|
-
h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
|
387
|
-
end
|
388
|
-
end
|
389
|
-
write_attribute(:schema, schema)
|
453
|
+
schema = send(split).data(all_columns: true, lazy: true).schema
|
454
|
+
set_schema(schema)
|
390
455
|
end
|
391
456
|
|
392
457
|
def learn_statistics(type: :raw, computed: false)
|
@@ -401,6 +466,7 @@ module EasyML
|
|
401
466
|
end
|
402
467
|
|
403
468
|
def process_data
|
469
|
+
learn(delete: false)
|
404
470
|
fit
|
405
471
|
normalize_all
|
406
472
|
end
|
@@ -452,16 +518,25 @@ module EasyML
|
|
452
518
|
end
|
453
519
|
|
454
520
|
def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
|
455
|
-
|
456
|
-
df =
|
521
|
+
puts "Apply missing features..."
|
522
|
+
df = apply_missing_columns(df, inference: inference)
|
523
|
+
puts "Transform columns..."
|
457
524
|
df = columns.transform(df, inference: inference)
|
525
|
+
puts "Apply features..."
|
458
526
|
df = apply_features(df, features)
|
527
|
+
puts "Transform columns..."
|
459
528
|
df = columns.transform(df, inference: inference, computed: true)
|
529
|
+
puts "Apply column mask..."
|
460
530
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
531
|
+
puts "Drop nulls..."
|
532
|
+
df = drop_nulls(df) unless inference
|
533
|
+
puts "Split features and targets..."
|
461
534
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
462
535
|
df
|
463
536
|
end
|
464
537
|
|
538
|
+
measure_method_timing :normalize
|
539
|
+
|
465
540
|
def missing_required_fields(df)
|
466
541
|
desc_df = df.describe
|
467
542
|
|
@@ -507,6 +582,7 @@ module EasyML
|
|
507
582
|
|
508
583
|
def cleanup
|
509
584
|
raw.cleanup
|
585
|
+
clipped.cleanup
|
510
586
|
processed.cleanup
|
511
587
|
end
|
512
588
|
|
@@ -583,7 +659,7 @@ module EasyML
|
|
583
659
|
one_hot_cats = columns.allowed_categories.symbolize_keys
|
584
660
|
|
585
661
|
# Map columns to names, handling one_hot expansion
|
586
|
-
scope.
|
662
|
+
scope.flat_map do |col|
|
587
663
|
if col.one_hot?
|
588
664
|
one_hot_cats[col.name.to_sym].map do |cat|
|
589
665
|
"#{col.name}_#{cat}"
|
@@ -591,7 +667,7 @@ module EasyML
|
|
591
667
|
else
|
592
668
|
col.name
|
593
669
|
end
|
594
|
-
end
|
670
|
+
end.sort
|
595
671
|
end
|
596
672
|
|
597
673
|
def column_mask(df, inference: false)
|
@@ -603,15 +679,23 @@ module EasyML
|
|
603
679
|
df[column_mask(df, inference: inference)]
|
604
680
|
end
|
605
681
|
|
606
|
-
|
682
|
+
measure_method_timing :apply_column_mask
|
683
|
+
|
684
|
+
def apply_missing_columns(df, inference: false, include_one_hots: false)
|
607
685
|
return df unless inference
|
608
686
|
|
609
|
-
|
687
|
+
missing_columns = (col_order(inference: inference) - df.columns).compact
|
610
688
|
unless include_one_hots
|
611
|
-
|
612
|
-
|
689
|
+
columns.one_hots.each do |one_hot|
|
690
|
+
virtual_columns = one_hot.virtual_columns
|
691
|
+
if virtual_columns.all? { |vc| df.columns.include?(vc) }
|
692
|
+
missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
|
693
|
+
else
|
694
|
+
missing_columns += columns.one_hots.map(&:name) - df.columns
|
695
|
+
end
|
696
|
+
end
|
613
697
|
end
|
614
|
-
df.with_columns(
|
698
|
+
df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
|
615
699
|
end
|
616
700
|
|
617
701
|
def drop_columns(all_columns: false)
|
@@ -653,6 +737,19 @@ module EasyML
|
|
653
737
|
apply_date_splitter_config
|
654
738
|
end
|
655
739
|
|
740
|
+
def fully_reload
|
741
|
+
return unless persisted?
|
742
|
+
|
743
|
+
base_vars = self.class.new.instance_variables
|
744
|
+
dirty_vars = (instance_variables - base_vars)
|
745
|
+
in_memory_classes = [EasyML::Data::Splits::InMemorySplit]
|
746
|
+
dirty_vars.each do |ivar|
|
747
|
+
value = instance_variable_get(ivar)
|
748
|
+
remove_instance_variable(ivar) unless in_memory_classes.any? { |in_memory_class| value.is_a?(in_memory_class) }
|
749
|
+
end
|
750
|
+
reload
|
751
|
+
end
|
752
|
+
|
656
753
|
private
|
657
754
|
|
658
755
|
def apply_date_splitter_config
|
@@ -678,8 +775,10 @@ module EasyML
|
|
678
775
|
|
679
776
|
def initialize_splits
|
680
777
|
@raw = nil
|
778
|
+
@clipped = nil
|
681
779
|
@processed = nil
|
682
780
|
raw
|
781
|
+
clipped
|
683
782
|
processed
|
684
783
|
end
|
685
784
|
|
@@ -706,6 +805,8 @@ module EasyML
|
|
706
805
|
after_refresh_datasource
|
707
806
|
end
|
708
807
|
|
808
|
+
measure_method_timing :refresh_datasource
|
809
|
+
|
709
810
|
def refresh_datasource!
|
710
811
|
datasource.reload.refresh!
|
711
812
|
after_refresh_datasource
|
@@ -713,6 +814,8 @@ module EasyML
|
|
713
814
|
|
714
815
|
def after_refresh_datasource
|
715
816
|
update(last_datasource_sha: datasource.sha)
|
817
|
+
schema
|
818
|
+
save
|
716
819
|
initialize_splits
|
717
820
|
end
|
718
821
|
|
@@ -720,7 +823,7 @@ module EasyML
|
|
720
823
|
processed.cleanup
|
721
824
|
|
722
825
|
SPLIT_ORDER.each do |segment|
|
723
|
-
df =
|
826
|
+
df = clipped.read(segment)
|
724
827
|
learn_computed_columns(df) if segment == :train
|
725
828
|
processed_df = normalize(df, all_columns: true)
|
726
829
|
processed.save(segment, processed_df)
|
@@ -728,6 +831,8 @@ module EasyML
|
|
728
831
|
@normalized = true
|
729
832
|
end
|
730
833
|
|
834
|
+
measure_method_timing :normalize_all
|
835
|
+
|
731
836
|
def learn_computed_columns(df)
|
732
837
|
return unless features.ready_to_apply.any?
|
733
838
|
|
@@ -739,6 +844,8 @@ module EasyML
|
|
739
844
|
processed.cleanup
|
740
845
|
end
|
741
846
|
|
847
|
+
measure_method_timing :learn_computed_columns
|
848
|
+
|
742
849
|
def drop_nulls(df)
|
743
850
|
return df if drop_if_null.nil? || drop_if_null.empty?
|
744
851
|
|
@@ -748,6 +855,8 @@ module EasyML
|
|
748
855
|
df.drop_nulls(subset: drop)
|
749
856
|
end
|
750
857
|
|
858
|
+
measure_method_timing :drop_nulls
|
859
|
+
|
751
860
|
# Pass refresh: false for frontend views so we don't query S3 during web requests
|
752
861
|
def load_data(segment, **kwargs, &block)
|
753
862
|
needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
|
@@ -761,9 +870,24 @@ module EasyML
|
|
761
870
|
end
|
762
871
|
|
763
872
|
def fit
|
873
|
+
apply_clip
|
764
874
|
learn_statistics(type: :raw)
|
765
875
|
end
|
766
876
|
|
877
|
+
def apply_clip
|
878
|
+
clipped.cleanup
|
879
|
+
|
880
|
+
SPLIT_ORDER.each do |segment|
|
881
|
+
df = raw.send(segment, lazy: true, all_columns: true)
|
882
|
+
clipped.save(
|
883
|
+
segment,
|
884
|
+
columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
|
885
|
+
)
|
886
|
+
end
|
887
|
+
end
|
888
|
+
|
889
|
+
measure_method_timing :apply_clip
|
890
|
+
|
767
891
|
# log_method :fit, "Learning statistics", verbose: true
|
768
892
|
|
769
893
|
def split_data!
|
@@ -779,6 +903,7 @@ module EasyML
|
|
779
903
|
raw.save(segment, df)
|
780
904
|
end
|
781
905
|
end
|
906
|
+
raw_schema # Set if not already set
|
782
907
|
end
|
783
908
|
|
784
909
|
def filter_duplicate_features
|
@@ -828,25 +953,14 @@ module EasyML
|
|
828
953
|
end
|
829
954
|
end
|
830
955
|
|
956
|
+
measure_method_timing :apply_features
|
957
|
+
|
831
958
|
def standardize_preprocessing_steps(type)
|
832
959
|
columns.map(&:name).zip(columns.map do |col|
|
833
960
|
col.preprocessing_steps&.dig(type)
|
834
961
|
end).to_h.compact.reject { |_k, v| v["method"] == "none" }
|
835
962
|
end
|
836
963
|
|
837
|
-
def fully_reload
|
838
|
-
return unless persisted?
|
839
|
-
|
840
|
-
base_vars = self.class.new.instance_variables
|
841
|
-
dirty_vars = (instance_variables - base_vars)
|
842
|
-
in_memory_classes = [EasyML::Data::Splits::InMemorySplit]
|
843
|
-
dirty_vars.each do |ivar|
|
844
|
-
value = instance_variable_get(ivar)
|
845
|
-
remove_instance_variable(ivar) unless in_memory_classes.any? { |in_memory_class| value.is_a?(in_memory_class) }
|
846
|
-
end
|
847
|
-
reload
|
848
|
-
end
|
849
|
-
|
850
964
|
def underscored_name
|
851
965
|
name.gsub(/\s{2,}/, " ").gsub(/\s/, "_").downcase
|
852
966
|
end
|
@@ -98,6 +98,15 @@ module EasyML
|
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
101
|
+
def to_config
|
102
|
+
EasyML::Export::Datasource.to_config(self)
|
103
|
+
end
|
104
|
+
|
105
|
+
def abort!
|
106
|
+
EasyML::Reaper.kill(EasyML::SyncDatasourceJob, id)
|
107
|
+
update(is_syncing: false)
|
108
|
+
end
|
109
|
+
|
101
110
|
def refresh_async
|
102
111
|
update(is_syncing: true)
|
103
112
|
EasyML::SyncDatasourceJob.perform_later(id)
|
data/app/models/easy_ml/event.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Column
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
|
6
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
7
|
+
id
|
8
|
+
feature_id
|
9
|
+
dataset_id
|
10
|
+
last_datasource_sha
|
11
|
+
last_feature_sha
|
12
|
+
learned_at
|
13
|
+
is_learning
|
14
|
+
configuration_changed_at
|
15
|
+
statistics
|
16
|
+
sample_values
|
17
|
+
in_raw_dataset
|
18
|
+
created_at
|
19
|
+
updated_at
|
20
|
+
).freeze
|
21
|
+
|
22
|
+
def self.to_config(column)
|
23
|
+
column.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Dataset
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
|
6
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
7
|
+
id
|
8
|
+
created_at
|
9
|
+
updated_at
|
10
|
+
statistics
|
11
|
+
root_dir
|
12
|
+
refreshed_at
|
13
|
+
sha
|
14
|
+
statistics
|
15
|
+
datasource_id
|
16
|
+
last_datasource_sha
|
17
|
+
num_rows
|
18
|
+
schema
|
19
|
+
raw_schema
|
20
|
+
status
|
21
|
+
).freeze
|
22
|
+
|
23
|
+
def self.to_config(dataset)
|
24
|
+
dataset.fully_reload
|
25
|
+
|
26
|
+
{
|
27
|
+
dataset: dataset.as_json.except(*UNCONFIGURABLE_COLUMNS).merge!(
|
28
|
+
splitter: dataset.splitter&.to_config,
|
29
|
+
datasource: dataset.datasource.to_config,
|
30
|
+
columns: dataset.columns.map(&:to_config),
|
31
|
+
features: dataset.features.map(&:to_config),
|
32
|
+
),
|
33
|
+
}.deep_compact.with_indifferent_access
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Datasource
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
UNCONFIGURABLE_COLUMNS = %w(id root_dir created_at updated_at refreshed_at sha)
|
6
|
+
|
7
|
+
def self.to_config(datasource)
|
8
|
+
datasource.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Feature
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
|
6
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
7
|
+
id
|
8
|
+
created_at
|
9
|
+
updated_at
|
10
|
+
dataset_id
|
11
|
+
sha
|
12
|
+
applied_at
|
13
|
+
fit_at
|
14
|
+
needs_fit
|
15
|
+
workflow_status
|
16
|
+
refresh_every
|
17
|
+
).freeze
|
18
|
+
|
19
|
+
def self.to_config(feature)
|
20
|
+
feature.as_json.except(*EasyML::Feature::UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Model
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
|
6
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
7
|
+
id
|
8
|
+
dataset_id
|
9
|
+
model_file_id
|
10
|
+
root_dir
|
11
|
+
file
|
12
|
+
sha
|
13
|
+
last_trained_at
|
14
|
+
is_training
|
15
|
+
created_at
|
16
|
+
updated_at
|
17
|
+
slug
|
18
|
+
early_stopping_rounds
|
19
|
+
).freeze
|
20
|
+
|
21
|
+
def self.to_config(model, include_dataset: true)
|
22
|
+
config = {
|
23
|
+
model: model.as_json.except(*UNCONFIGURABLE_COLUMNS).merge!(
|
24
|
+
weights: model.weights,
|
25
|
+
),
|
26
|
+
}
|
27
|
+
|
28
|
+
if include_dataset
|
29
|
+
config[:model][:dataset] = model.dataset.to_config["dataset"]
|
30
|
+
end
|
31
|
+
|
32
|
+
if model.retraining_job.present?
|
33
|
+
config[:model][:retraining_job] = EasyML::Export::RetrainingJob.to_config(model.retraining_job)
|
34
|
+
end
|
35
|
+
|
36
|
+
config.deep_compact.with_indifferent_access
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class RetrainingJob
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
|
6
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
7
|
+
id
|
8
|
+
model_id
|
9
|
+
last_tuning_at
|
10
|
+
last_run_at
|
11
|
+
created_at
|
12
|
+
updated_at
|
13
|
+
).freeze
|
14
|
+
|
15
|
+
def self.to_config(retraining_job)
|
16
|
+
retraining_job.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Export
|
3
|
+
class Splitter
|
4
|
+
using EasyML::DeepCompact
|
5
|
+
UNCONFIGURABLE_COLUMNS = %w[id created_at updated_at dataset_id]
|
6
|
+
|
7
|
+
def self.to_config(splitter)
|
8
|
+
return nil unless splitter.present?
|
9
|
+
|
10
|
+
splitter.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -474,6 +474,27 @@ module EasyML
|
|
474
474
|
update!(updates)
|
475
475
|
end
|
476
476
|
|
477
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
478
|
+
id
|
479
|
+
dataset_id
|
480
|
+
sha
|
481
|
+
applied_at
|
482
|
+
fit_at
|
483
|
+
created_at
|
484
|
+
updated_at
|
485
|
+
needs_fit
|
486
|
+
workflow_status
|
487
|
+
refresh_every
|
488
|
+
)
|
489
|
+
|
490
|
+
def to_config
|
491
|
+
EasyML::Export::Feature.to_config(self)
|
492
|
+
end
|
493
|
+
|
494
|
+
def self.from_config(config, dataset, action: :create)
|
495
|
+
EasyML::Import::Feature.from_config(config, dataset, action: action)
|
496
|
+
end
|
497
|
+
|
477
498
|
private
|
478
499
|
|
479
500
|
def bulk_update_positions(features)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Import
|
3
|
+
class Column
|
4
|
+
def self.permitted_keys
|
5
|
+
@permitted_keys ||= EasyML::Column.columns.map(&:name).map(&:to_sym) -
|
6
|
+
EasyML::Export::Column::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.from_config(config, dataset, action: :create)
|
10
|
+
column_name = config["name"]
|
11
|
+
existing_column = dataset.columns.find_by(name: column_name)
|
12
|
+
|
13
|
+
case action
|
14
|
+
when :create
|
15
|
+
dataset.columns.create(config)
|
16
|
+
when :update
|
17
|
+
if existing_column
|
18
|
+
existing_column.update!(config)
|
19
|
+
existing_column
|
20
|
+
else
|
21
|
+
# Do not create column if it does not exist in the raw dataset
|
22
|
+
end
|
23
|
+
else
|
24
|
+
raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.validate(config, idx)
|
29
|
+
extra_keys = config.keys.map(&:to_sym) - permitted_keys
|
30
|
+
raise ArgumentError, "Invalid keys in column config at index #{idx}: #{extra_keys.join(", ")}" unless extra_keys.empty?
|
31
|
+
config
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|