easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/application_controller.rb +4 -0
  4. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  5. data/app/controllers/easy_ml/models_controller.rb +3 -0
  6. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  7. data/app/frontend/components/DatasetPreview.tsx +50 -19
  8. data/app/frontend/components/ModelForm.tsx +1 -1
  9. data/app/frontend/components/SearchableSelect.tsx +0 -1
  10. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  11. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  12. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  13. data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
  14. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  15. data/app/frontend/types/dataset.ts +3 -0
  16. data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
  17. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  18. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  19. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  21. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  22. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  23. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  24. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  25. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  26. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  27. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  28. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  29. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  30. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  31. data/app/models/easy_ml/column/imputers.rb +126 -0
  32. data/app/models/easy_ml/column/learner.rb +18 -0
  33. data/app/models/easy_ml/column/learners/base.rb +103 -0
  34. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  35. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  36. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  37. data/app/models/easy_ml/column/learners/null.rb +22 -0
  38. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  39. data/app/models/easy_ml/column/learners/string.rb +15 -0
  40. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  41. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  42. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  43. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  44. data/app/models/easy_ml/column/lineage.rb +28 -0
  45. data/app/models/easy_ml/column/selector.rb +96 -0
  46. data/app/models/easy_ml/column.rb +344 -39
  47. data/app/models/easy_ml/column_history.rb +31 -20
  48. data/app/models/easy_ml/column_list.rb +79 -62
  49. data/app/models/easy_ml/dataset.rb +156 -104
  50. data/app/models/easy_ml/dataset_history.rb +23 -23
  51. data/app/models/easy_ml/datasource.rb +4 -0
  52. data/app/models/easy_ml/datasource_history.rb +1 -0
  53. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  54. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  55. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  56. data/app/models/easy_ml/feature.rb +29 -10
  57. data/app/models/easy_ml/feature_history.rb +12 -0
  58. data/app/models/easy_ml/feature_list.rb +15 -0
  59. data/app/models/easy_ml/model.rb +25 -4
  60. data/app/models/easy_ml/model_history.rb +1 -0
  61. data/app/models/easy_ml/retraining_run.rb +1 -0
  62. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  63. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  64. data/config/initializers/enumerable.rb +17 -0
  65. data/config/initializers/inflections.rb +2 -0
  66. data/config/routes.rb +3 -0
  67. data/lib/easy_ml/core/tuner.rb +1 -1
  68. data/lib/easy_ml/data/date_converter.rb +137 -30
  69. data/lib/easy_ml/data/polars_column.rb +17 -0
  70. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  71. data/lib/easy_ml/data/polars_reader.rb +20 -1
  72. data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
  73. data/lib/easy_ml/data/splits/split.rb +2 -1
  74. data/lib/easy_ml/data/synced_directory.rb +5 -3
  75. data/lib/easy_ml/data.rb +1 -2
  76. data/lib/easy_ml/feature_store.rb +33 -22
  77. data/lib/easy_ml/predict.rb +13 -2
  78. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
  79. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
  80. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  81. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  82. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  83. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  84. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  85. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  86. data/lib/easy_ml/version.rb +1 -1
  87. data/lib/tasks/profile.rake +40 -0
  88. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  89. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  90. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  91. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  92. metadata +45 -10
  93. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  94. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  95. data/lib/easy_ml/data/preprocessor.rb +0 -383
  96. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  97. data/lib/easy_ml/data/statistics_learner.rb +0 -128
  98. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  99. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
  100. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -2,21 +2,29 @@
2
2
  #
3
3
  # Table name: easy_ml_columns
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :bigint not null
7
- # name :string not null
8
- # description :string
9
- # datatype :string
10
- # polars_datatype :string
11
- # is_target :boolean
12
- # hidden :boolean default(FALSE)
13
- # drop_if_null :boolean default(FALSE)
14
- # preprocessing_steps :json
15
- # sample_values :json
16
- # statistics :json
17
- # created_at :datetime not null
18
- # updated_at :datetime not null
19
- # is_date_column :boolean default(FALSE)
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string not null
8
+ # description :string
9
+ # datatype :string
10
+ # polars_datatype :string
11
+ # is_target :boolean default(FALSE)
12
+ # hidden :boolean default(FALSE)
13
+ # drop_if_null :boolean default(FALSE)
14
+ # preprocessing_steps :json
15
+ # sample_values :json
16
+ # statistics :json
17
+ # created_at :datetime not null
18
+ # updated_at :datetime not null
19
+ # is_date_column :boolean default(FALSE)
20
+ # computed_by :string
21
+ # is_computed :boolean default(FALSE)
22
+ # feature_id :bigint
23
+ # learned_at :datetime
24
+ # is_learning :boolean default(FALSE)
25
+ # last_datasource_sha :string
26
+ # last_feature_sha :string
27
+ # configuration_changed_at :datetime
20
28
  #
21
29
  module EasyML
22
30
  class Column < ActiveRecord::Base
@@ -25,6 +33,7 @@ module EasyML
25
33
  historiographer_mode :snapshot_only
26
34
 
27
35
  belongs_to :dataset, class_name: "EasyML::Dataset"
36
+ belongs_to :feature, class_name: "EasyML::Feature", optional: true
28
37
 
29
38
  validates :name, presence: true
30
39
  validates :name, uniqueness: { scope: :dataset_id }
@@ -32,6 +41,8 @@ module EasyML
32
41
  before_save :ensure_valid_datatype
33
42
  after_save :handle_date_column_change
34
43
  before_save :set_defaults
44
+ before_save :set_feature_lineage
45
+ before_save :set_polars_datatype
35
46
 
36
47
  # Scopes
37
48
  scope :visible, -> { where(hidden: false) }
@@ -39,8 +50,73 @@ module EasyML
39
50
  scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
40
51
  scope :datetime, -> { where(datatype: "datetime") }
41
52
  scope :date_column, -> { where(is_date_column: true) }
53
+ scope :not_preprocessed, -> { where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
54
+ scope :preprocessed, -> { where("preprocessing_steps IS NOT NULL AND preprocessing_steps::text != '{}'::text") }
55
+ scope :required, -> { raw.visible.not_target.not_preprocessed }
56
+ scope :optional, -> { required.not }
57
+ scope :target, -> { where(is_target: true) }
58
+ scope :not_target, -> { where(is_target: false) }
59
+ scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
60
+ scope :computed, -> { where(is_computed: true) }
61
+ scope :raw, -> { where(is_computed: false) }
62
+ scope :needs_learn, -> {
63
+ datasource_changed
64
+ .or(feature_applied)
65
+ .or(feature_changed)
66
+ .or(column_changed)
67
+ .or(never_learned)
68
+ .or(is_learning)
69
+ }
70
+
71
+ scope :datasource_changed, -> {
72
+ left_joins(dataset: :datasource)
73
+ .left_joins(:feature)
74
+ .where(
75
+ arel_table[:last_datasource_sha].not_eq(
76
+ Datasource.arel_table[:sha]
77
+ )
78
+ )
79
+ }
80
+
81
+ scope :feature_changed, -> {
82
+ where(feature_id: Feature.has_changes.map(&:id))
83
+ }
84
+
85
+ scope :feature_applied, -> {
86
+ left_joins(dataset: :datasource)
87
+ .left_joins(:feature)
88
+ .where(
89
+ Feature.arel_table[:applied_at].gt(
90
+ Arel.sql("COALESCE(#{arel_table.name}.learned_at, '1970-01-01')")
91
+ ).and(
92
+ arel_table[:feature_id].not_eq(nil)
93
+ )
94
+ )
95
+ }
96
+
97
+ scope :column_changed, -> {
98
+ left_joins(dataset: :datasource)
99
+ .left_joins(:feature)
100
+ .where(Dataset.arel_table[:refreshed_at].lt(arel_table[:updated_at]))
101
+ }
102
+
103
+ scope :never_learned, -> {
104
+ left_joins(dataset: :datasource)
105
+ .left_joins(:feature)
106
+ .where(arel_table[:learned_at].eq(nil))
107
+ .where(Datasource.arel_table[:sha].not_eq(nil))
108
+ }
109
+ scope :is_learning, -> { where(is_learning: true) }
110
+
111
+ def display_attributes
112
+ attributes.except(:statistics)
113
+ end
114
+
115
+ def inspect
116
+ "#<#{self.class.name} #{display_attributes.map { |k, v| "#{k}: #{v}" }.join(", ")}>"
117
+ end
42
118
 
43
- def columns
119
+ def aliases
44
120
  [name].concat(virtual_columns)
45
121
  end
46
122
 
@@ -52,12 +128,174 @@ module EasyML
52
128
  end
53
129
  end
54
130
 
131
+ delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
132
+
133
+ def empty?
134
+ data.blank?
135
+ end
136
+
137
+ def learn(type: :all)
138
+ return if (!in_raw_dataset? && type != :processed)
139
+
140
+ if !in_raw_dataset? && read_attribute(:datatype).nil?
141
+ assign_attributes(datatype: processed.data.to_series.dtype)
142
+ end
143
+ set_sample_values
144
+ assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
145
+ assign_attributes(
146
+ learned_at: UTC.now,
147
+ last_datasource_sha: dataset.last_datasource_sha,
148
+ last_feature_sha: feature&.sha,
149
+ is_learning: type == :raw,
150
+ )
151
+ end
152
+
153
+ def set_configuration_changed_at
154
+ if preprocessing_steps_changed? || datatype_changed?
155
+ self.configuration_changed_at = Time.now
156
+ end
157
+ end
158
+
159
+ def set_sample_values
160
+ use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
161
+
162
+ base = use_processed ? processed : raw
163
+ sample_values = base.data(limit: 5, unique: true)
164
+ if sample_values.columns.include?(name)
165
+ sample_values = sample_values[name].to_a.uniq[0...5]
166
+ assign_attributes(sample_values: sample_values)
167
+ end
168
+ end
169
+
170
+ def transform(df, inference: false, computed: false)
171
+ imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
172
+
173
+ df = imputer.transform(df)
174
+ df
175
+ end
176
+
177
+ def imputers
178
+ @imputers ||= Column::Imputers.new(self)
179
+ end
180
+
181
+ def decode_labels(df)
182
+ imputers.training.decode_labels(df)
183
+ end
184
+
185
+ def preprocessed?
186
+ !preprocessing_steps.blank?
187
+ end
188
+
55
189
  def datatype=(dtype)
190
+ if dtype.is_a?(Polars::DataType)
191
+ dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
192
+ end
56
193
  write_attribute(:datatype, dtype)
57
- write_attribute(:polars_datatype, dtype)
194
+ set_polars_datatype
195
+ end
196
+
197
+ def datatype
198
+ read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
199
+ end
200
+
201
+ def raw_dtype
202
+ return @raw_dtype if @raw_dtype
203
+
204
+ if in_raw_dataset?
205
+ @raw_dtype = raw&.data&.to_series&.dtype
206
+ elsif already_computed?
207
+ @raw_dtype = processed&.data&.to_series&.dtype
208
+ end
209
+ end
210
+
211
+ def set_polars_datatype
212
+ raw_type = raw_dtype
213
+ user_type = get_polars_type(datatype)
214
+
215
+ if raw_type == user_type
216
+ # A raw type of Polars::Datetime might have extra information like timezone, so prefer the raw type
217
+ write_attribute(:polars_datatype, raw_type.to_s)
218
+ else
219
+ # If a user specified type doesn't match the raw type, use the user type
220
+ write_attribute(:polars_datatype, user_type.to_s)
221
+ end
222
+ end
223
+
224
+ def polars_datatype
225
+ begin
226
+ raw_attr = read_attribute(:polars_datatype)
227
+ if raw_attr.nil?
228
+ get_polars_type(datatype)
229
+ else
230
+ EasyML::Data::PolarsColumn.parse_polars_dtype(raw_attr)
231
+ end
232
+ rescue => e
233
+ get_polars_type(datatype)
234
+ end
235
+ end
236
+
237
+ EasyML::Data::PolarsColumn::TYPE_MAP.keys.each do |dtype|
238
+ define_method("#{dtype}?") do
239
+ datatype.to_s == dtype.to_s
240
+ end
241
+ end
242
+
243
+ def datasource_raw
244
+ dataset.datasource.query(select: name)
245
+ end
246
+
247
+ def already_computed?
248
+ is_computed && computing_feature&.fit_at.present? || computing_feature&.applied_at.present?
249
+ end
250
+
251
+ def assumed_datatype
252
+ return @assumed_datatype if @assumed_datatype
253
+
254
+ if in_raw_dataset?
255
+ series = (raw.data || datasource_raw).to_series
256
+ @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
257
+ elsif already_computed?
258
+ return nil if processed.data.nil?
259
+
260
+ @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(processed.data.to_series)
261
+ end
262
+ end
263
+
264
+ def in_raw_dataset?
265
+ return false if dataset&.raw&.data.nil?
266
+
267
+ dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
268
+ end
269
+
270
+ def computing_feature
271
+ dataset&.features&.detect { |feature| feature.computes_columns.include?(name) }.tap do |computing_feature|
272
+ if computing_feature.present? && feature_id != computing_feature.id
273
+ update(feature_id: computing_feature.id)
274
+ end
275
+ end
276
+ end
277
+
278
+ alias_method :feature, :computing_feature
279
+
280
+ def set_feature_lineage
281
+ if dataset.features.computed_column_names.include?(name)
282
+ if computed_by.nil?
283
+ assign_attributes(
284
+ is_computed: true,
285
+ computed_by: computing_feature&.name,
286
+ )
287
+ end
288
+ elsif computed_by.present?
289
+ assign_attributes(
290
+ is_computed: false,
291
+ computed_by: nil,
292
+ )
293
+ end
58
294
  end
59
295
 
60
296
  def get_polars_type(dtype)
297
+ return nil if dtype.nil?
298
+
61
299
  EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
62
300
  end
63
301
 
@@ -79,7 +317,7 @@ module EasyML
79
317
  next config unless config[:params]&.key?(:constant)
80
318
 
81
319
  config.deep_dup.tap do |c|
82
- c[:params][:constant] = convert_to_type(c[:params][:constant])
320
+ c[:params][:constant] = cast(c[:params][:constant])
83
321
  end
84
322
  end
85
323
 
@@ -98,18 +336,97 @@ module EasyML
98
336
  preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
99
337
  end
100
338
 
339
+ def encoding
340
+ return nil unless categorical?
341
+ return :ordinal if ordinal_encoding?
342
+ return :one_hot
343
+ end
344
+
345
+ def categorical_min
346
+ return default_categorical_min unless categorical?
347
+
348
+ (preprocessing_steps || {}).deep_symbolize_keys.dig(:training, :params, :categorical_min) || default_categorical_min
349
+ end
350
+
351
+ def default_categorical_min
352
+ 1
353
+ end
354
+
355
+ def statistics
356
+ (read_attribute(:statistics) || {}).with_indifferent_access
357
+ end
358
+
101
359
  def allowed_categories
102
- return [] unless one_hot?
103
- stats = dataset.preprocessor.statistics
360
+ stats = statistics
104
361
  return [] if stats.nil? || stats.blank?
105
362
 
106
- stats.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
363
+ stats = stats.deep_symbolize_keys
364
+ type = is_computed? ? :processed : :raw
365
+ stats = stats.dig(type)
366
+
367
+ # Can we LEARN dtype during LEARN phase... for computed columns to deal with this ish man
368
+ sorted = (stats.dig(:allowed_categories) || []).sort_by(&method(:sort_by))
369
+ sorted = sorted.concat(["other"]) if categorical?
370
+ sorted
371
+ end
372
+
373
+ def sort_by(value)
374
+ case datatype.to_sym
375
+ when :boolean
376
+ value == true ? 1 : 0
377
+ else
378
+ value
379
+ end
107
380
  end
108
381
 
109
382
  def date_column?
110
383
  is_date_column
111
384
  end
112
385
 
386
+ def lineage
387
+ @lineage ||= EasyML::Column::Lineage.new(self).lineage
388
+ end
389
+
390
+ def required?
391
+ !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
392
+ end
393
+
394
+ def sort_required
395
+ required? ? 0 : 1
396
+ end
397
+
398
+ def to_api
399
+ {
400
+ name: name,
401
+ datatype: datatype,
402
+ description: description,
403
+ required: required?,
404
+ allowed_values: allowed_categories.empty? ? nil : allowed_categories,
405
+ }.compact
406
+ end
407
+
408
+ def cast(value)
409
+ return value if value.nil?
410
+
411
+ case datatype&.to_sym
412
+ when :float
413
+ Float(value)
414
+ when :integer
415
+ Integer(value)
416
+ when :boolean
417
+ ActiveModel::Type::Boolean.new.cast(value)
418
+ when :datetime
419
+ value.is_a?(String) ? Time.parse(value) : value
420
+ when :categorical
421
+ value
422
+ else
423
+ value.to_s
424
+ end
425
+ rescue ArgumentError, TypeError
426
+ # If conversion fails, return original value
427
+ value
428
+ end
429
+
113
430
  private
114
431
 
115
432
  def set_defaults
@@ -209,26 +526,14 @@ module EasyML
209
526
  throw :abort
210
527
  end
211
528
 
212
- def convert_to_type(value)
213
- return value if value.nil?
529
+ NUMERIC_METHODS = %i[mean median].freeze
214
530
 
215
- case datatype&.to_sym
216
- when :float
217
- Float(value)
218
- when :integer
219
- Integer(value)
220
- when :boolean
221
- ActiveModel::Type::Boolean.new.cast(value)
222
- when :datetime
223
- value.is_a?(String) ? Time.parse(value) : value
224
- else
225
- value.to_s
226
- end
227
- rescue ArgumentError, TypeError
228
- # If conversion fails, return original value
229
- value
531
+ def data_selector
532
+ @data_selector ||= Column::Selector.new(self)
230
533
  end
231
534
 
232
- NUMERIC_METHODS = %i[mean median].freeze
535
+ def learner
536
+ @learner ||= Column::Learner.new(self)
537
+ end
233
538
  end
234
539
  end
@@ -2,30 +2,41 @@
2
2
  #
3
3
  # Table name: easy_ml_column_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # column_id :integer not null
7
- # dataset_id :integer not null
8
- # name :string not null
9
- # description :string
10
- # datatype :string
11
- # polars_datatype :string
12
- # is_target :boolean
13
- # hidden :boolean default(FALSE)
14
- # drop_if_null :boolean default(FALSE)
15
- # preprocessing_steps :json
16
- # sample_values :json
17
- # statistics :json
18
- # created_at :datetime not null
19
- # updated_at :datetime not null
20
- # history_started_at :datetime not null
21
- # history_ended_at :datetime
22
- # history_user_id :integer
23
- # snapshot_id :string
24
- # is_date_column :boolean default(FALSE)
5
+ # id :bigint not null, primary key
6
+ # column_id :integer not null
7
+ # dataset_id :integer not null
8
+ # name :string not null
9
+ # description :string
10
+ # datatype :string
11
+ # polars_datatype :string
12
+ # is_target :boolean default(FALSE)
13
+ # hidden :boolean default(FALSE)
14
+ # drop_if_null :boolean default(FALSE)
15
+ # preprocessing_steps :json
16
+ # sample_values :json
17
+ # statistics :json
18
+ # created_at :datetime not null
19
+ # updated_at :datetime not null
20
+ # history_started_at :datetime not null
21
+ # history_ended_at :datetime
22
+ # history_user_id :integer
23
+ # snapshot_id :string
24
+ # is_date_column :boolean default(FALSE)
25
+ # computed_by :string
26
+ # is_computed :boolean default(FALSE)
27
+ # feature_id :bigint
28
+ # learned_at :datetime
29
+ # is_learning :boolean default(FALSE)
30
+ # last_datasource_sha :string
31
+ # last_feature_sha :string
32
+ # configuration_changed_at :datetime
25
33
  #
26
34
  module EasyML
27
35
  class ColumnHistory < ActiveRecord::Base
28
36
  self.table_name = "easy_ml_column_histories"
29
37
  include Historiographer::History
38
+ scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
39
+ scope :computed, -> { where(is_computed: true) }
40
+ scope :raw, -> { where(is_computed: false) }
30
41
  end
31
42
  end
@@ -1,5 +1,7 @@
1
1
  module EasyML
2
2
  module ColumnList
3
+ include Historiographer::Relation
4
+
3
5
  def sync(delete: true)
4
6
  return unless dataset.schema.present?
5
7
 
@@ -7,10 +9,10 @@ module EasyML
7
9
  col_names = syncable
8
10
  existing_columns = where(name: col_names)
9
11
  import_new(col_names, existing_columns)
10
- update_existing(existing_columns)
12
+ # update_existing(existing_columns)
11
13
 
12
14
  if delete
13
- delete_missing(existing_columns)
15
+ delete_missing(col_names)
14
16
  end
15
17
 
16
18
  if existing_columns.none? # Totally new dataset
@@ -19,6 +21,64 @@ module EasyML
19
21
  end
20
22
  end
21
23
 
24
+ def transform(df, inference: false, computed: false)
25
+ return df if df.nil?
26
+
27
+ if computed
28
+ cols = column_list.computed
29
+ else
30
+ cols = column_list.raw
31
+ end
32
+
33
+ by_name = cols.index_by(&:name)
34
+ df.columns.each do |col|
35
+ column = by_name[col]
36
+ df = column.transform(df, inference: inference, computed: computed) if column
37
+ end
38
+
39
+ df
40
+ end
41
+
42
+ def learn(type: :raw, computed: false)
43
+ cols_to_learn = column_list.reload.needs_learn
44
+ cols_to_learn = cols_to_learn.computed if computed
45
+ cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
46
+ cols_to_learn.each { |col| col.learn(type: type) }
47
+ EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
48
+ statistics
49
+ learned_at
50
+ sample_values
51
+ last_datasource_sha
52
+ is_learning
53
+ datatype
54
+ polars_datatype
55
+ ] })
56
+ set_feature_lineage
57
+ reload
58
+ end
59
+
60
+ def set_feature_lineage
61
+ names = dataset.features.computed_column_names
62
+ columns = where(name: names, computed_by: nil).map do |col|
63
+ col.assign_attributes(
64
+ is_computed: true,
65
+ computed_by: col.computing_feature&.name,
66
+ )
67
+ col
68
+ end
69
+ EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
70
+ end
71
+
72
+ def statistics
73
+ stats = { raw: {}, processed: {} }
74
+ select(&:persisted?).inject(stats) do |h, col|
75
+ h.tap do
76
+ h[:raw][col.name] = col.statistics.dig(:raw)
77
+ h[:processed][col.name] = col.statistics.dig(:processed)
78
+ end
79
+ end.with_indifferent_access
80
+ end
81
+
22
82
  def one_hots
23
83
  column_list.select(&:one_hot?)
24
84
  end
@@ -37,14 +97,9 @@ module EasyML
37
97
  end
38
98
  end
39
99
 
40
- def virtual_column?(column)
41
- false
42
- end
43
-
44
100
  def syncable
45
101
  dataset.processed_schema.keys.select do |col|
46
- !one_hot?(col) &&
47
- !virtual_column?(col)
102
+ !one_hot?(col)
48
103
  end
49
104
  end
50
105
 
@@ -56,73 +111,35 @@ module EasyML
56
111
  proxy_association.owner
57
112
  end
58
113
 
114
+ def sort_by_required
115
+ column_list.sort_by { |col| [col.sort_required, col.name] }
116
+ end
117
+
59
118
  private
60
119
 
61
120
  def import_new(new_columns, existing_columns)
62
121
  new_columns = new_columns - existing_columns.map(&:name)
63
122
  cols_to_insert = new_columns.map do |col_name|
64
- EasyML::Column.new(
123
+ col = EasyML::Column.new(
65
124
  name: col_name,
66
125
  dataset_id: dataset.id,
67
126
  )
127
+ col
68
128
  end
69
129
  EasyML::Column.import(cols_to_insert)
130
+ set_feature_lineage
131
+ column_list.reload
70
132
  end
71
133
 
72
- def update_existing(existing_columns)
73
- stats = dataset.statistics
74
- use_processed = dataset.processed.data(limit: 1).present?
75
- cached_sample = use_processed ? dataset.processed.data(limit: 10, all_columns: true) : dataset.raw.data(limit: 10, all_columns: true)
76
- existing_types = existing_columns.map(&:name).zip(existing_columns.map(&:datatype)).to_h
77
- polars_types = cached_sample.columns.zip((cached_sample.dtypes.map do |dtype|
78
- EasyML::Data::PolarsColumn.polars_to_sym(dtype).to_s
79
- end)).to_h
80
-
81
- existing_columns.each do |column|
82
- new_polars_type = polars_types[column.name]
83
- existing_type = existing_types[column.name]
84
- schema_type = dataset.schema[column.name]
85
-
86
- # Keep both datatype and polars_datatype if it's an ordinal encoding case
87
- if column.ordinal_encoding?
88
- actual_type = existing_type
89
- actual_schema_type = existing_type
90
- else
91
- actual_type = new_polars_type
92
- actual_schema_type = schema_type
93
- end
94
-
95
- if column.one_hot?
96
- base = dataset.raw
97
- processed = stats.dig("raw", column.name).dup
98
- processed["null_count"] = 0
99
- actual_schema_type = "categorical"
100
- actual_type = "categorical"
101
- else
102
- base = use_processed ? dataset.processed : dataset.raw
103
- processed = stats.dig("processed", column.name)
104
- end
105
- sample_values = base.send(:data, unique: true, limit: 5, all_columns: true, select: column.name)[column.name].to_a.uniq[0...5]
106
-
107
- column.assign_attributes(
108
- statistics: {
109
- raw: stats.dig("raw", column.name),
110
- processed: processed,
111
- },
112
- datatype: actual_schema_type,
113
- polars_datatype: actual_type,
114
- sample_values: sample_values,
115
- )
116
- end
117
- EasyML::Column.import(existing_columns.to_a,
118
- { on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
119
- sample_values] } })
120
- end
121
-
122
- def delete_missing(existing_columns)
123
- raw_cols = dataset.raw.train(all_columns: true, limit: 1).columns
134
+ def delete_missing(col_names)
135
+ raw_cols = dataset.best_segment.data(all_columns: true, limit: 1).columns
124
136
  raw_cols = where(name: raw_cols)
125
- columns_to_delete = column_list - existing_columns - raw_cols
137
+ columns_to_delete = column_list.select do |col|
138
+ col_names.exclude?(col.name) &&
139
+ one_hots.map(&:name).exclude?(col.name) &&
140
+ raw_cols.map(&:name).exclude?(col.name) &&
141
+ dataset.features.flat_map(&:computes_columns).exclude?(col.name)
142
+ end
126
143
  columns_to_delete.each(&:destroy!)
127
144
  end
128
145
  end