easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +38 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +69 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/dataset/PreprocessingConfig.tsx +2 -2
  10. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  11. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  12. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  13. data/app/frontend/components/models/index.ts +2 -0
  14. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  15. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  16. data/app/frontend/pages/ModelsPage.tsx +37 -11
  17. data/app/frontend/types/dataset.ts +1 -2
  18. data/app/frontend/types.ts +1 -1
  19. data/app/jobs/easy_ml/training_job.rb +2 -2
  20. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  21. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  22. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  23. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  24. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  25. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  26. data/app/models/easy_ml/column/imputers.rb +3 -1
  27. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  28. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  30. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  31. data/app/models/easy_ml/column/selector.rb +4 -0
  32. data/app/models/easy_ml/column.rb +79 -63
  33. data/app/models/easy_ml/column_history.rb +28 -28
  34. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  35. data/app/models/easy_ml/column_list.rb +39 -26
  36. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  37. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  38. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  39. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  40. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  47. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  48. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  49. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  50. data/app/models/easy_ml/dataset/learner.rb +100 -0
  51. data/app/models/easy_ml/dataset.rb +150 -36
  52. data/app/models/easy_ml/dataset_history.rb +1 -0
  53. data/app/models/easy_ml/datasource.rb +13 -5
  54. data/app/models/easy_ml/event.rb +4 -0
  55. data/app/models/easy_ml/export/column.rb +27 -0
  56. data/app/models/easy_ml/export/dataset.rb +37 -0
  57. data/app/models/easy_ml/export/datasource.rb +12 -0
  58. data/app/models/easy_ml/export/feature.rb +24 -0
  59. data/app/models/easy_ml/export/model.rb +40 -0
  60. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  61. data/app/models/easy_ml/export/splitter.rb +14 -0
  62. data/app/models/easy_ml/feature.rb +21 -0
  63. data/app/models/easy_ml/import/column.rb +35 -0
  64. data/app/models/easy_ml/import/dataset.rb +148 -0
  65. data/app/models/easy_ml/import/feature.rb +36 -0
  66. data/app/models/easy_ml/import/model.rb +136 -0
  67. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  68. data/app/models/easy_ml/import/splitter.rb +34 -0
  69. data/app/models/easy_ml/lineage.rb +44 -0
  70. data/app/models/easy_ml/model.rb +93 -36
  71. data/app/models/easy_ml/model_file.rb +6 -0
  72. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  73. data/app/models/easy_ml/models/xgboost.rb +33 -9
  74. data/app/models/easy_ml/retraining_job.rb +8 -1
  75. data/app/models/easy_ml/retraining_run.rb +6 -4
  76. data/app/models/easy_ml/splitter.rb +8 -0
  77. data/app/models/lineage_history.rb +6 -0
  78. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  79. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  80. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  81. data/config/routes.rb +13 -1
  82. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  83. data/lib/easy_ml/core/tuner.rb +12 -11
  84. data/lib/easy_ml/data/polars_column.rb +149 -100
  85. data/lib/easy_ml/data/polars_reader.rb +8 -5
  86. data/lib/easy_ml/data/polars_schema.rb +56 -0
  87. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  88. data/lib/easy_ml/data/splits/split.rb +10 -1
  89. data/lib/easy_ml/data.rb +1 -0
  90. data/lib/easy_ml/deep_compact.rb +19 -0
  91. data/lib/easy_ml/feature_store.rb +2 -6
  92. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  93. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  95. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  96. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  97. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  98. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  99. data/lib/easy_ml/timing.rb +34 -0
  100. data/lib/easy_ml/version.rb +1 -1
  101. data/lib/easy_ml.rb +2 -0
  102. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  103. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +1 -0
  104. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +522 -0
  105. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js.map +1 -0
  106. metadata +52 -12
  107. data/app/models/easy_ml/column/learners/base.rb +0 -103
  108. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  109. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  110. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  111. data/app/models/easy_ml/column/learners/null.rb +0 -22
  112. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  113. data/app/models/easy_ml/column/learners/string.rb +0 -15
  114. data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
  115. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
  116. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
data/app/models/easy_ml/dataset.rb
@@ -19,6 +19,7 @@
 # created_at :datetime not null
 # updated_at :datetime not null
 # last_datasource_sha :string
+# raw_schema :jsonb
 #
 module EasyML
   class Dataset < ActiveRecord::Base
@@ -86,6 +87,26 @@ module EasyML
       }
     end
 
+    UNCONFIGURABLE_COLUMNS = %w(
+      id
+      statistics
+      root_dir
+      created_at
+      updated_at
+      refreshed_at
+      sha
+      datasource_id
+      last_datasource_sha
+    )
+
+    def to_config
+      EasyML::Export::Dataset.to_config(self)
+    end
+
+    def self.from_config(json_config, action: nil, dataset: nil)
+      EasyML::Import::Dataset.from_config(json_config, action: action, dataset: dataset)
+    end
+
     def root_dir=(value)
       raise "Cannot override value of root_dir!" unless value.to_s == root_dir.to_s
 
@@ -111,12 +132,41 @@ module EasyML
       FileUtils.rm_rf(root_dir) if root_dir.present?
     end
 
+    def as_json
+      @serializing = true
+      super.tap do
+        @serializing = false
+      end
+    end
+
     def schema
-      read_attribute(:schema) || datasource.schema || datasource.after_sync.schema
+      return @schema if @schema
+      return read_attribute(:schema) if @serializing
+
+      schema = read_attribute(:schema) || datasource.schema || datasource.after_sync.schema
+      schema = set_schema(schema)
+      @schema = EasyML::Data::PolarsSchema.deserialize(schema)
+    end
+
+    def raw_schema
+      return @raw_schema if @raw_schema
+      return read_attribute(:raw_schema) if @serializing
+
+      raw_schema = read_attribute(:raw_schema) || datasource.schema || datasource.after_sync.schema
+      raw_schema = set_raw_schema(raw_schema)
+      @raw_schema = EasyML::Data::PolarsSchema.deserialize(raw_schema)
+    end
+
+    def set_schema(schema)
+      write_attribute(:schema, EasyML::Data::PolarsSchema.serialize(schema))
+    end
+
+    def set_raw_schema(raw_schema)
+      write_attribute(:raw_schema, EasyML::Data::PolarsSchema.serialize(raw_schema))
     end
 
     def processed_schema
-      processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
+      processed.data(limit: 1, lazy: true)&.schema || raw.data(limit: 1)&.schema
     end
 
     def num_rows
@@ -126,6 +176,12 @@ module EasyML
       datasource&.num_rows
     end
 
+    def abort!
+      EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
+      update(workflow_status: :ready)
+      unlock!
+    end
+
     def refresh_async
       return if analyzing?
 
@@ -145,6 +201,12 @@ module EasyML
       @raw = initialize_split("raw")
     end
 
+    def clipped
+      return @clipped if @clipped && @clipped.dataset
+
+      @clipped = initialize_split("clipped")
+    end
+
     def processed
       return @processed if @processed && @processed.dataset
 
@@ -186,23 +248,26 @@ module EasyML
 
     def actually_refresh
       refreshing do
-        puts "actually_refresh"
         learn(delete: false) # After syncing datasource, learn new statistics + sync columns
         process_data
-        puts "process_data"
         fully_reload
-        puts "Learning..."
         learn
         learn_statistics(type: :processed) # After processing data, we learn any new statistics
+        fully_reload
         now = UTC.now
         update(workflow_status: "ready", refreshed_at: now, updated_at: now)
         fully_reload
       end
     end
 
+    include EasyML::Timing
+    measure_method_timing :actually_refresh
+
     def refresh!(async: false)
       refreshing do
+        puts "Prepare..."
         prepare!
+        puts "Fit features..."
         fit_features!(async: async)
       end
     end
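Note: the timing hooks above come from EasyML::Timing (lib/easy_ml/timing.rb, +34 lines in this release), whose implementation is not shown in this diff. As a rough, illustrative sketch only — not the gem's actual code — a measure_method_timing class macro typically wraps the original method and logs its wall-clock duration:

# Illustrative sketch; EasyML::Timing's real implementation is not part of this hunk.
module MethodTiming
  def measure_method_timing(method_name)
    original = instance_method(method_name)
    define_method(method_name) do |*args, **kwargs, &block|
      start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      result = original.bind(self).call(*args, **kwargs, &block)
      elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
      puts "#{method_name} took #{elapsed.round(3)}s"
      result
    end
  end
end

class Report
  extend MethodTiming

  def build
    sleep(0.1)
    :done
  end
  measure_method_timing :build
end

Report.new.build # => :done, printing something like "build took 0.1s"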
@@ -218,6 +283,8 @@ module EasyML
       end
     end
 
+    measure_method_timing :refresh
+
     def fit_features!(async: false, features: self.features)
       fit_features(async: async, features: features, force: true)
     end
@@ -229,6 +296,8 @@ module EasyML
       features.first.fit(features: features_to_compute, async: async)
     end
 
+    measure_method_timing :fit_features
+
     def after_fit_features
       puts "after fit features..."
       unlock!
@@ -378,15 +447,11 @@ module EasyML
     end
 
     def learn_schema
-      data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
-      return nil if data.nil?
+      split = processed.data(limit: 1).to_a.any? ? :processed : :raw
+      return nil if split.nil?
 
-      schema = data.schema.reduce({}) do |h, (k, v)|
-        h.tap do
-          h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
-        end
-      end
-      write_attribute(:schema, schema)
+      schema = send(split).data(all_columns: true, lazy: true).schema
+      set_schema(schema)
     end
 
     def learn_statistics(type: :raw, computed: false)
@@ -401,6 +466,7 @@ module EasyML
     end
 
     def process_data
+      learn(delete: false)
       fit
       normalize_all
     end
@@ -452,16 +518,25 @@ module EasyML
     end
 
     def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
-      df = apply_missing_features(df, inference: inference)
-      df = drop_nulls(df)
+      puts "Apply missing features..."
+      df = apply_missing_columns(df, inference: inference)
+      puts "Transform columns..."
       df = columns.transform(df, inference: inference)
+      puts "Apply features..."
       df = apply_features(df, features)
+      puts "Transform columns..."
       df = columns.transform(df, inference: inference, computed: true)
+      puts "Apply column mask..."
       df = apply_column_mask(df, inference: inference) unless all_columns
+      puts "Drop nulls..."
+      df = drop_nulls(df) unless inference
+      puts "Split features and targets..."
       df, = processed.split_features_targets(df, true, target) if split_ys
       df
     end
 
+    measure_method_timing :normalize
+
     def missing_required_fields(df)
       desc_df = df.describe
 
@@ -507,6 +582,7 @@ module EasyML
 
     def cleanup
       raw.cleanup
+      clipped.cleanup
       processed.cleanup
     end
 
@@ -583,7 +659,7 @@ module EasyML
       one_hot_cats = columns.allowed_categories.symbolize_keys
 
       # Map columns to names, handling one_hot expansion
-      scope.sort_by(&:id).flat_map do |col|
+      scope.flat_map do |col|
         if col.one_hot?
           one_hot_cats[col.name.to_sym].map do |cat|
             "#{col.name}_#{cat}"
@@ -591,7 +667,7 @@ module EasyML
         else
           col.name
         end
-      end
+      end.sort
     end
 
     def column_mask(df, inference: false)
@@ -603,15 +679,23 @@ module EasyML
       df[column_mask(df, inference: inference)]
     end
 
-    def apply_missing_features(df, inference: false, include_one_hots: false)
+    measure_method_timing :apply_column_mask
+
+    def apply_missing_columns(df, inference: false, include_one_hots: false)
       return df unless inference
 
-      missing_features = (col_order(inference: inference) - df.columns).compact
+      missing_columns = (col_order(inference: inference) - df.columns).compact
       unless include_one_hots
-        missing_features -= columns.one_hots.flat_map(&:virtual_columns) unless include_one_hots
-        missing_features += columns.one_hots.map(&:name) - df.columns
+        columns.one_hots.each do |one_hot|
+          virtual_columns = one_hot.virtual_columns
+          if virtual_columns.all? { |vc| df.columns.include?(vc) }
+            missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
+          else
+            missing_columns += columns.one_hots.map(&:name) - df.columns
+          end
+        end
       end
-      df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
+      df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
     end
 
     def drop_columns(all_columns: false)
@@ -653,6 +737,19 @@ module EasyML
       apply_date_splitter_config
     end
 
+    def fully_reload
+      return unless persisted?
+
+      base_vars = self.class.new.instance_variables
+      dirty_vars = (instance_variables - base_vars)
+      in_memory_classes = [EasyML::Data::Splits::InMemorySplit]
+      dirty_vars.each do |ivar|
+        value = instance_variable_get(ivar)
+        remove_instance_variable(ivar) unless in_memory_classes.any? { |in_memory_class| value.is_a?(in_memory_class) }
+      end
+      reload
+    end
+
     private
 
     def apply_date_splitter_config
@@ -678,8 +775,10 @@ module EasyML
 
     def initialize_splits
       @raw = nil
+      @clipped = nil
       @processed = nil
       raw
+      clipped
       processed
     end
 
@@ -706,6 +805,8 @@ module EasyML
       after_refresh_datasource
     end
 
+    measure_method_timing :refresh_datasource
+
     def refresh_datasource!
       datasource.reload.refresh!
       after_refresh_datasource
@@ -713,6 +814,8 @@ module EasyML
 
     def after_refresh_datasource
       update(last_datasource_sha: datasource.sha)
+      schema
+      save
       initialize_splits
     end
 
@@ -720,7 +823,7 @@ module EasyML
       processed.cleanup
 
       SPLIT_ORDER.each do |segment|
-        df = raw.read(segment)
+        df = clipped.read(segment)
         learn_computed_columns(df) if segment == :train
         processed_df = normalize(df, all_columns: true)
         processed.save(segment, processed_df)
@@ -728,6 +831,8 @@
       @normalized = true
     end
 
+    measure_method_timing :normalize_all
+
     def learn_computed_columns(df)
       return unless features.ready_to_apply.any?
 
@@ -739,6 +844,8 @@
       processed.cleanup
     end
 
+    measure_method_timing :learn_computed_columns
+
    def drop_nulls(df)
      return df if drop_if_null.nil? || drop_if_null.empty?
 
@@ -748,6 +855,8 @@
       df.drop_nulls(subset: drop)
     end
 
+    measure_method_timing :drop_nulls
+
     # Pass refresh: false for frontend views so we don't query S3 during web requests
     def load_data(segment, **kwargs, &block)
       needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
@@ -761,9 +870,24 @@
     end
 
     def fit
+      apply_clip
       learn_statistics(type: :raw)
     end
 
+    def apply_clip
+      clipped.cleanup
+
+      SPLIT_ORDER.each do |segment|
+        df = raw.send(segment, lazy: true, all_columns: true)
+        clipped.save(
+          segment,
+          columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
+        )
+      end
+    end
+
+    measure_method_timing :apply_clip
+
     # log_method :fit, "Learning statistics", verbose: true
 
     def split_data!
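Note on the comment in apply_clip above: keeping columns.apply_clip lazy lets the clipped split be streamed to disk rather than fully materialized. A rough sketch of that pattern with the polars-df gem (paths and column names are hypothetical):

require "polars-df"

lazy = Polars.scan_parquet("raw/train.parquet")                   # LazyFrame; nothing loaded yet
clipped = lazy.with_columns(Polars.col("price").clip(0, 10_000))  # still lazy
clipped.sink_parquet("clipped/train.parquet")                     # streams the result to disk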
@@ -779,6 +903,7 @@
           raw.save(segment, df)
         end
       end
+      raw_schema # Set if not already set
     end
 
     def filter_duplicate_features
@@ -828,25 +953,14 @@
       end
     end
 
+    measure_method_timing :apply_features
+
     def standardize_preprocessing_steps(type)
       columns.map(&:name).zip(columns.map do |col|
         col.preprocessing_steps&.dig(type)
       end).to_h.compact.reject { |_k, v| v["method"] == "none" }
     end
 
-    def fully_reload
-      return unless persisted?
-
-      base_vars = self.class.new.instance_variables
-      dirty_vars = (instance_variables - base_vars)
-      in_memory_classes = [EasyML::Data::Splits::InMemorySplit]
-      dirty_vars.each do |ivar|
-        value = instance_variable_get(ivar)
-        remove_instance_variable(ivar) unless in_memory_classes.any? { |in_memory_class| value.is_a?(in_memory_class) }
-      end
-      reload
-    end
-
     def underscored_name
       name.gsub(/\s{2,}/, " ").gsub(/\s/, "_").downcase
     end
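Taken together, the dataset changes above add config export/import entry points and an abort hook. A hedged usage sketch (the dataset name is hypothetical):

dataset = EasyML::Dataset.find_by(name: "My Dataset")

config = dataset.to_config                                      # delegates to EasyML::Export::Dataset
EasyML::Dataset.from_config(config, action: :update, dataset: dataset)

dataset.abort!                                                  # kill a stuck RefreshDatasetJob, mark ready, unlock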
data/app/models/easy_ml/dataset_history.rb
@@ -24,6 +24,7 @@
 # history_user_id :integer
 # snapshot_id :string
 # last_datasource_sha :string
+# raw_schema :jsonb
 #
 module EasyML
   class DatasetHistory < ActiveRecord::Base
data/app/models/easy_ml/datasource.rb
@@ -56,7 +56,8 @@ module EasyML
 
    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
    attr_accessor :schema, :columns, :num_rows, :is_syncing
-   belongs_to :dataset, class_name: "EasyML::Dataset", optional: true, dependent: :destroy
+   belongs_to :dataset, class_name: "EasyML::Dataset", optional: true
+   before_destroy :destroy_dataset
 
    add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
    DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
@@ -74,10 +75,8 @@ module EasyML
      }
    end
 
-   def reread(columns = nil)
-     return false unless adapter.respond_to?(:convert_to_parquet)
-
-     adapter.convert_to_parquet(columns)
+   def destroy_dataset
+     dataset&.destroy!
    end
 
    def available_files
@@ -98,6 +97,15 @@ module EasyML
      end
    end
 
+   def to_config
+     EasyML::Export::Datasource.to_config(self)
+   end
+
+   def abort!
+     EasyML::Reaper.kill(EasyML::SyncDatasourceJob, id)
+     update(is_syncing: false)
+   end
+
    def refresh_async
      update(is_syncing: true)
      EasyML::SyncDatasourceJob.perform_later(id)
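The datasource gains matching helpers; a brief hedged sketch (the lookup is hypothetical):

datasource = EasyML::Datasource.find_by(name: "My Datasource")
datasource.to_config   # serializable config via EasyML::Export::Datasource
datasource.abort!      # kill an in-flight SyncDatasourceJob and clear is_syncing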
data/app/models/easy_ml/event.rb
@@ -60,6 +60,10 @@ module EasyML
      stacktrace.select { |loc| loc.match?(/easy_ml/) }
    end
 
+   def self.called_by?(matcher)
+     easy_ml_context(caller).any? { |line| line.match?(matcher) }
+   end
+
    def self.format_stacktrace(error)
      return nil if error.nil?
 
data/app/models/easy_ml/export/column.rb (new file)
@@ -0,0 +1,27 @@
+module EasyML
+  module Export
+    class Column
+      using EasyML::DeepCompact
+
+      UNCONFIGURABLE_COLUMNS = %w(
+        id
+        feature_id
+        dataset_id
+        last_datasource_sha
+        last_feature_sha
+        learned_at
+        is_learning
+        configuration_changed_at
+        statistics
+        sample_values
+        in_raw_dataset
+        created_at
+        updated_at
+      ).freeze
+
+      def self.to_config(column)
+        column.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
data/app/models/easy_ml/export/dataset.rb (new file)
@@ -0,0 +1,37 @@
+module EasyML
+  module Export
+    class Dataset
+      using EasyML::DeepCompact
+
+      UNCONFIGURABLE_COLUMNS = %w(
+        id
+        created_at
+        updated_at
+        statistics
+        root_dir
+        refreshed_at
+        sha
+        statistics
+        datasource_id
+        last_datasource_sha
+        num_rows
+        schema
+        raw_schema
+        status
+      ).freeze
+
+      def self.to_config(dataset)
+        dataset.fully_reload
+
+        {
+          dataset: dataset.as_json.except(*UNCONFIGURABLE_COLUMNS).merge!(
+            splitter: dataset.splitter&.to_config,
+            datasource: dataset.datasource.to_config,
+            columns: dataset.columns.map(&:to_config),
+            features: dataset.features.map(&:to_config),
+          ),
+        }.deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
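For orientation, the hash returned by Export::Dataset.to_config nests the splitter, datasource, column, and feature configs under a single dataset key; roughly (keys abridged, values hypothetical):

{
  dataset: {
    name: "My Dataset",
    splitter: { splitter_type: "random" },
    datasource: { name: "My Datasource", datasource_type: "s3" },
    columns: [{ name: "price", datatype: "float" }],
    features: [{ name: "FamilySize", feature_class: "FamilySizeFeature" }],
  },
}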
data/app/models/easy_ml/export/datasource.rb (new file)
@@ -0,0 +1,12 @@
+module EasyML
+  module Export
+    class Datasource
+      using EasyML::DeepCompact
+      UNCONFIGURABLE_COLUMNS = %w(id root_dir created_at updated_at refreshed_at sha)
+
+      def self.to_config(datasource)
+        datasource.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
data/app/models/easy_ml/export/feature.rb (new file)
@@ -0,0 +1,24 @@
+module EasyML
+  module Export
+    class Feature
+      using EasyML::DeepCompact
+
+      UNCONFIGURABLE_COLUMNS = %w(
+        id
+        created_at
+        updated_at
+        dataset_id
+        sha
+        applied_at
+        fit_at
+        needs_fit
+        workflow_status
+        refresh_every
+      ).freeze
+
+      def self.to_config(feature)
+        feature.as_json.except(*EasyML::Feature::UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
data/app/models/easy_ml/export/model.rb (new file)
@@ -0,0 +1,40 @@
+module EasyML
+  module Export
+    class Model
+      using EasyML::DeepCompact
+
+      UNCONFIGURABLE_COLUMNS = %w(
+        id
+        dataset_id
+        model_file_id
+        root_dir
+        file
+        sha
+        last_trained_at
+        is_training
+        created_at
+        updated_at
+        slug
+        early_stopping_rounds
+      ).freeze
+
+      def self.to_config(model, include_dataset: true)
+        config = {
+          model: model.as_json.except(*UNCONFIGURABLE_COLUMNS).merge!(
+            weights: model.weights,
+          ),
+        }
+
+        if include_dataset
+          config[:model][:dataset] = model.dataset.to_config["dataset"]
+        end
+
+        if model.retraining_job.present?
+          config[:model][:retraining_job] = EasyML::Export::RetrainingJob.to_config(model.retraining_job)
+        end
+
+        config.deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
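A hedged usage sketch (the model lookup is hypothetical):

model = EasyML::Model.find_by(name: "My Model")

EasyML::Export::Model.to_config(model)                          # nests the dataset config by default
EasyML::Export::Model.to_config(model, include_dataset: false)  # model attributes, weights, retraining_job only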
data/app/models/easy_ml/export/retraining_job.rb (new file)
@@ -0,0 +1,20 @@
+module EasyML
+  module Export
+    class RetrainingJob
+      using EasyML::DeepCompact
+
+      UNCONFIGURABLE_COLUMNS = %w(
+        id
+        model_id
+        last_tuning_at
+        last_run_at
+        created_at
+        updated_at
+      ).freeze
+
+      def self.to_config(retraining_job)
+        retraining_job.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
data/app/models/easy_ml/export/splitter.rb (new file)
@@ -0,0 +1,14 @@
+module EasyML
+  module Export
+    class Splitter
+      using EasyML::DeepCompact
+      UNCONFIGURABLE_COLUMNS = %w[id created_at updated_at dataset_id]
+
+      def self.to_config(splitter)
+        return nil unless splitter.present?
+
+        splitter.as_json.except(*UNCONFIGURABLE_COLUMNS).deep_compact.with_indifferent_access
+      end
+    end
+  end
+end
data/app/models/easy_ml/feature.rb
@@ -474,6 +474,27 @@ module EasyML
      update!(updates)
    end
 
+   UNCONFIGURABLE_COLUMNS = %w(
+     id
+     dataset_id
+     sha
+     applied_at
+     fit_at
+     created_at
+     updated_at
+     needs_fit
+     workflow_status
+     refresh_every
+   )
+
+   def to_config
+     EasyML::Export::Feature.to_config(self)
+   end
+
+   def self.from_config(config, dataset, action: :create)
+     EasyML::Import::Feature.from_config(config, dataset, action: action)
+   end
+
    private
 
    def bulk_update_positions(features)
data/app/models/easy_ml/import/column.rb (new file)
@@ -0,0 +1,35 @@
+module EasyML
+  module Import
+    class Column
+      def self.permitted_keys
+        @permitted_keys ||= EasyML::Column.columns.map(&:name).map(&:to_sym) -
+                            EasyML::Export::Column::UNCONFIGURABLE_COLUMNS.map(&:to_sym)
+      end
+
+      def self.from_config(config, dataset, action: :create)
+        column_name = config["name"]
+        existing_column = dataset.columns.find_by(name: column_name)
+
+        case action
+        when :create
+          dataset.columns.create(config)
+        when :update
+          if existing_column
+            existing_column.update!(config)
+            existing_column
+          else
+            # Do not create column if it does not exist in the raw dataset
+          end
+        else
+          raise ArgumentError, "Invalid action: #{action}. Must be :create or :update"
+        end
+      end
+
+      def self.validate(config, idx)
+        extra_keys = config.keys.map(&:to_sym) - permitted_keys
+        raise ArgumentError, "Invalid keys in column config at index #{idx}: #{extra_keys.join(", ")}" unless extra_keys.empty?
+        config
+      end
+    end
+  end
+end
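A hedged usage sketch of the importer (the config keys shown are hypothetical):

config = { "name" => "age", "datatype" => "integer", "hidden" => false }

EasyML::Import::Column.validate(config, 0)                           # raises ArgumentError on unknown keys
EasyML::Import::Column.from_config(config, dataset, action: :update) # updates the column if it already exists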