easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1

data/app/models/easy_ml/column.rb

@@ -2,23 +2,29 @@
  #
  # Table name: easy_ml_columns
  #
- # id :bigint not null, primary key
- # dataset_id :bigint not null
- # name :string not null
- # description :string
- # datatype :string
- # polars_datatype :string
- # is_target :boolean default(FALSE)
- # hidden :boolean default(FALSE)
- # drop_if_null :boolean default(FALSE)
- # preprocessing_steps :json
- # sample_values :json
- # statistics :json
- # created_at :datetime not null
- # updated_at :datetime not null
- # is_date_column :boolean default(FALSE)
- # computed_by :string
- # is_computed :boolean default(FALSE)
+ # id :bigint not null, primary key
+ # dataset_id :bigint not null
+ # name :string not null
+ # description :string
+ # datatype :string
+ # polars_datatype :string
+ # is_target :boolean default(FALSE)
+ # hidden :boolean default(FALSE)
+ # drop_if_null :boolean default(FALSE)
+ # preprocessing_steps :json
+ # sample_values :json
+ # statistics :json
+ # created_at :datetime not null
+ # updated_at :datetime not null
+ # is_date_column :boolean default(FALSE)
+ # computed_by :string
+ # is_computed :boolean default(FALSE)
+ # feature_id :bigint
+ # learned_at :datetime
+ # is_learning :boolean default(FALSE)
+ # last_datasource_sha :string
+ # last_feature_sha :string
+ # configuration_changed_at :datetime
  #
  module EasyML
  class Column < ActiveRecord::Base
@@ -27,6 +33,7 @@ module EasyML
  historiographer_mode :snapshot_only

  belongs_to :dataset, class_name: "EasyML::Dataset"
+ belongs_to :feature, class_name: "EasyML::Feature", optional: true

  validates :name, presence: true
  validates :name, uniqueness: { scope: :dataset_id }
@@ -34,6 +41,8 @@ module EasyML
  before_save :ensure_valid_datatype
  after_save :handle_date_column_change
  before_save :set_defaults
+ before_save :set_feature_lineage
+ before_save :set_polars_datatype

  # Scopes
  scope :visible, -> { where(hidden: false) }
@@ -41,9 +50,71 @@ module EasyML
  scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
  scope :datetime, -> { where(datatype: "datetime") }
  scope :date_column, -> { where(is_date_column: true) }
- scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
+ scope :not_preprocessed, -> { where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
+ scope :preprocessed, -> { where("preprocessing_steps IS NOT NULL AND preprocessing_steps::text != '{}'::text") }
+ scope :required, -> { raw.visible.not_target.not_preprocessed }
+ scope :optional, -> { required.not }
+ scope :target, -> { where(is_target: true) }
+ scope :not_target, -> { where(is_target: false) }
  scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
  scope :computed, -> { where(is_computed: true) }
+ scope :raw, -> { where(is_computed: false) }
+ scope :needs_learn, -> {
+ datasource_changed
+ .or(feature_applied)
+ .or(feature_changed)
+ .or(column_changed)
+ .or(never_learned)
+ .or(is_learning)
+ }
+
+ scope :datasource_changed, -> {
+ left_joins(dataset: :datasource)
+ .left_joins(:feature)
+ .where(
+ arel_table[:last_datasource_sha].not_eq(
+ Datasource.arel_table[:sha]
+ )
+ )
+ }
+
+ scope :feature_changed, -> {
+ where(feature_id: Feature.has_changes.map(&:id))
+ }
+
+ scope :feature_applied, -> {
+ left_joins(dataset: :datasource)
+ .left_joins(:feature)
+ .where(
+ Feature.arel_table[:applied_at].gt(
+ Arel.sql("COALESCE(#{arel_table.name}.learned_at, '1970-01-01')")
+ ).and(
+ arel_table[:feature_id].not_eq(nil)
+ )
+ )
+ }
+
+ scope :column_changed, -> {
+ left_joins(dataset: :datasource)
+ .left_joins(:feature)
+ .where(Dataset.arel_table[:refreshed_at].lt(arel_table[:updated_at]))
+ }
+
+ scope :never_learned, -> {
+ left_joins(dataset: :datasource)
+ .left_joins(:feature)
+ .where(arel_table[:learned_at].eq(nil))
+ .where(Datasource.arel_table[:sha].not_eq(nil))
+ }
+ scope :is_learning, -> { where(is_learning: true) }
+
+ def display_attributes
+ attributes.except(:statistics)
+ end
+
+ def inspect
+ "#<#{self.class.name} #{display_attributes.map { |k, v| "#{k}: #{v}" }.join(", ")}>"
+ end

  def aliases
  [name].concat(virtual_columns)
@@ -57,12 +128,174 @@ module EasyML
  end
  end

+ delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
+
+ def empty?
+ data.blank?
+ end
+
+ def learn(type: :all)
+ return if (!in_raw_dataset? && type != :processed)
+
+ if !in_raw_dataset? && read_attribute(:datatype).nil?
+ assign_attributes(datatype: processed.data.to_series.dtype)
+ end
+ set_sample_values
+ assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
+ assign_attributes(
+ learned_at: UTC.now,
+ last_datasource_sha: dataset.last_datasource_sha,
+ last_feature_sha: feature&.sha,
+ is_learning: type == :raw,
+ )
+ end
+
+ def set_configuration_changed_at
+ if preprocessing_steps_changed? || datatype_changed?
+ self.configuration_changed_at = Time.now
+ end
+ end
+
+ def set_sample_values
+ use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
+
+ base = use_processed ? processed : raw
+ sample_values = base.data(limit: 5, unique: true)
+ if sample_values.columns.include?(name)
+ sample_values = sample_values[name].to_a.uniq[0...5]
+ assign_attributes(sample_values: sample_values)
+ end
+ end
+
+ def transform(df, inference: false, computed: false)
+ imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
+
+ df = imputer.transform(df)
+ df
+ end
+
+ def imputers
+ @imputers ||= Column::Imputers.new(self)
+ end
+
+ def decode_labels(df)
+ imputers.training.decode_labels(df)
+ end
+
+ def preprocessed?
+ !preprocessing_steps.blank?
+ end
+
  def datatype=(dtype)
+ if dtype.is_a?(Polars::DataType)
+ dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
+ end
  write_attribute(:datatype, dtype)
- write_attribute(:polars_datatype, dtype)
+ set_polars_datatype
+ end
+
+ def datatype
+ read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
+ end
+
+ def raw_dtype
+ return @raw_dtype if @raw_dtype
+
+ if in_raw_dataset?
+ @raw_dtype = raw&.data&.to_series&.dtype
+ elsif already_computed?
+ @raw_dtype = processed&.data&.to_series&.dtype
+ end
+ end
+
+ def set_polars_datatype
+ raw_type = raw_dtype
+ user_type = get_polars_type(datatype)
+
+ if raw_type == user_type
+ # A raw type of Polars::Datetime might have extra information like timezone, so prefer the raw type
+ write_attribute(:polars_datatype, raw_type.to_s)
+ else
+ # If a user specified type doesn't match the raw type, use the user type
+ write_attribute(:polars_datatype, user_type.to_s)
+ end
+ end
+
+ def polars_datatype
+ begin
+ raw_attr = read_attribute(:polars_datatype)
+ if raw_attr.nil?
+ get_polars_type(datatype)
+ else
+ EasyML::Data::PolarsColumn.parse_polars_dtype(raw_attr)
+ end
+ rescue => e
+ get_polars_type(datatype)
+ end
+ end
+
+ EasyML::Data::PolarsColumn::TYPE_MAP.keys.each do |dtype|
+ define_method("#{dtype}?") do
+ datatype.to_s == dtype.to_s
+ end
+ end
+
+ def datasource_raw
+ dataset.datasource.query(select: name)
+ end
+
+ def already_computed?
+ is_computed && computing_feature&.fit_at.present? || computing_feature&.applied_at.present?
+ end
+
+ def assumed_datatype
+ return @assumed_datatype if @assumed_datatype
+
+ if in_raw_dataset?
+ series = (raw.data || datasource_raw).to_series
+ @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
+ elsif already_computed?
+ return nil if processed.data.nil?
+
+ @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(processed.data.to_series)
+ end
+ end
+
+ def in_raw_dataset?
+ return false if dataset&.raw&.data.nil?
+
+ dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
+ end
+
+ def computing_feature
+ dataset&.features&.detect { |feature| feature.computes_columns.include?(name) }.tap do |computing_feature|
+ if computing_feature.present? && feature_id != computing_feature.id
+ update(feature_id: computing_feature.id)
+ end
+ end
+ end
+
+ alias_method :feature, :computing_feature
+
+ def set_feature_lineage
+ if dataset.features.computed_column_names.include?(name)
+ if computed_by.nil?
+ assign_attributes(
+ is_computed: true,
+ computed_by: computing_feature&.name,
+ )
+ end
+ elsif computed_by.present?
+ assign_attributes(
+ is_computed: false,
+ computed_by: nil,
+ )
+ end
  end

  def get_polars_type(dtype)
+ return nil if dtype.nil?
+
  EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
  end

@@ -84,7 +317,7 @@ module EasyML
  next config unless config[:params]&.key?(:constant)

  config.deep_dup.tap do |c|
- c[:params][:constant] = convert_to_type(c[:params][:constant])
+ c[:params][:constant] = cast(c[:params][:constant])
  end
  end

@@ -103,15 +336,47 @@ module EasyML
  preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
  end

+ def encoding
+ return nil unless categorical?
+ return :ordinal if ordinal_encoding?
+ return :one_hot
+ end
+
+ def categorical_min
+ return default_categorical_min unless categorical?
+
+ (preprocessing_steps || {}).deep_symbolize_keys.dig(:training, :params, :categorical_min) || default_categorical_min
+ end
+
+ def default_categorical_min
+ 1
+ end
+
+ def statistics
+ (read_attribute(:statistics) || {}).with_indifferent_access
+ end
+
  def allowed_categories
- return [] unless one_hot?
- stats = dataset.statistics
+ stats = statistics
  return [] if stats.nil? || stats.blank?

  stats = stats.deep_symbolize_keys
- stats = stats.dig(:raw)
+ type = is_computed? ? :processed : :raw
+ stats = stats.dig(type)

- (stats.dig(name.to_sym, :allowed_categories) || []).sort.concat(["other"])
+ # Can we LEARN dtype during LEARN phase... for computed columns to deal with this ish man
+ sorted = (stats.dig(:allowed_categories) || []).sort_by(&method(:sort_by))
+ sorted = sorted.concat(["other"]) if categorical?
+ sorted
+ end
+
+ def sort_by(value)
+ case datatype.to_sym
+ when :boolean
+ value == true ? 1 : 0
+ else
+ value
+ end
  end

  def date_column?
@@ -119,19 +384,11 @@ module EasyML
  end

  def lineage
- [
- present_in_raw_dataset ? "Raw dataset" : nil,
- computed_by ? "Computed by #{computed_by}" : nil,
- preprocessing_steps.present? ? "Preprocessed using #{preprocessing_steps.keys.join(", ")}" : nil,
- ].compact
+ @lineage ||= EasyML::Column::Lineage.new(self).lineage
  end

  def required?
- is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
- end
-
- def present_in_raw_dataset
- dataset.raw.data&.columns&.include?(name) || false
+ !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
  end

  def sort_required
@@ -148,6 +405,28 @@ module EasyML
  }.compact
  end

+ def cast(value)
+ return value if value.nil?
+
+ case datatype&.to_sym
+ when :float
+ Float(value)
+ when :integer
+ Integer(value)
+ when :boolean
+ ActiveModel::Type::Boolean.new.cast(value)
+ when :datetime
+ value.is_a?(String) ? Time.parse(value) : value
+ when :categorical
+ value
+ else
+ value.to_s
+ end
+ rescue ArgumentError, TypeError
+ # If conversion fails, return original value
+ value
+ end
+
  private

  def set_defaults
@@ -247,26 +526,14 @@ module EasyML
  throw :abort
  end

- def convert_to_type(value)
- return value if value.nil?
+ NUMERIC_METHODS = %i[mean median].freeze

- case datatype&.to_sym
- when :float
- Float(value)
- when :integer
- Integer(value)
- when :boolean
- ActiveModel::Type::Boolean.new.cast(value)
- when :datetime
- value.is_a?(String) ? Time.parse(value) : value
- else
- value.to_s
- end
- rescue ArgumentError, TypeError
- # If conversion fails, return original value
- value
+ def data_selector
+ @data_selector ||= Column::Selector.new(self)
  end

- NUMERIC_METHODS = %i[mean median].freeze
+ def learner
+ @learner ||= Column::Learner.new(self)
+ end
  end
  end
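
For orientation, a hedged usage sketch of the reworked Column API above: the public cast helper that replaces the private convert_to_type, plus the recomposed required scope and the new needs_learn scope. The column name "age" and its :integer datatype are assumptions made for the example, not values taken from the gem.

# Illustrative only; assumes an EasyML::Column named "age" with datatype :integer exists.
column = EasyML::Column.find_by(name: "age")

# cast coerces a constant into the column's datatype and falls back to the
# original value when Integer()/Float()/Time.parse raises ArgumentError or TypeError.
column.cast("42")    # => 42
column.cast("oops")  # => "oops"

# The scopes now compose: required is raw.visible.not_target.not_preprocessed,
# and needs_learn unions the datasource/feature/column change detectors.
EasyML::Column.required.count
EasyML::Column.needs_learn.pluck(:name)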

data/app/models/easy_ml/column_history.rb

@@ -2,28 +2,34 @@
  #
  # Table name: easy_ml_column_histories
  #
- # id :bigint not null, primary key
- # column_id :integer not null
- # dataset_id :integer not null
- # name :string not null
- # description :string
- # datatype :string
- # polars_datatype :string
- # is_target :boolean default(FALSE)
- # hidden :boolean default(FALSE)
- # drop_if_null :boolean default(FALSE)
- # preprocessing_steps :json
- # sample_values :json
- # statistics :json
- # created_at :datetime not null
- # updated_at :datetime not null
- # history_started_at :datetime not null
- # history_ended_at :datetime
- # history_user_id :integer
- # snapshot_id :string
- # is_date_column :boolean default(FALSE)
- # computed_by :string
- # is_computed :boolean default(FALSE)
+ # id :bigint not null, primary key
+ # column_id :integer not null
+ # dataset_id :integer not null
+ # name :string not null
+ # description :string
+ # datatype :string
+ # polars_datatype :string
+ # is_target :boolean default(FALSE)
+ # hidden :boolean default(FALSE)
+ # drop_if_null :boolean default(FALSE)
+ # preprocessing_steps :json
+ # sample_values :json
+ # statistics :json
+ # created_at :datetime not null
+ # updated_at :datetime not null
+ # history_started_at :datetime not null
+ # history_ended_at :datetime
+ # history_user_id :integer
+ # snapshot_id :string
+ # is_date_column :boolean default(FALSE)
+ # computed_by :string
+ # is_computed :boolean default(FALSE)
+ # feature_id :bigint
+ # learned_at :datetime
+ # is_learning :boolean default(FALSE)
+ # last_datasource_sha :string
+ # last_feature_sha :string
+ # configuration_changed_at :datetime
  #
  module EasyML
  class ColumnHistory < ActiveRecord::Base
@@ -31,5 +37,6 @@ module EasyML
  include Historiographer::History
  scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
  scope :computed, -> { where(is_computed: true) }
+ scope :raw, -> { where(is_computed: false) }
  end
  end
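
A brief hedged sketch of querying the history model, whose table now mirrors the new column fields (learned_at, last_feature_sha, and so on) and gains the same raw scope as Column; the query itself is illustrative, not taken from the gem.

# Most recent snapshots of non-computed columns that have been learned at least once.
EasyML::ColumnHistory.raw.where.not(learned_at: nil).order(history_started_at: :desc).limit(5)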

data/app/models/easy_ml/column_list.rb

@@ -9,8 +9,7 @@ module EasyML
  col_names = syncable
  existing_columns = where(name: col_names)
  import_new(col_names, existing_columns)
- update_existing(existing_columns)
- set_feature_lineage
+ # update_existing(existing_columns)

  if delete
  delete_missing(col_names)
@@ -22,6 +21,64 @@ module EasyML
  end
  end

+ def transform(df, inference: false, computed: false)
+ return df if df.nil?
+
+ if computed
+ cols = column_list.computed
+ else
+ cols = column_list.raw
+ end
+
+ by_name = cols.index_by(&:name)
+ df.columns.each do |col|
+ column = by_name[col]
+ df = column.transform(df, inference: inference, computed: computed) if column
+ end
+
+ df
+ end
+
+ def learn(type: :raw, computed: false)
+ cols_to_learn = column_list.reload.needs_learn
+ cols_to_learn = cols_to_learn.computed if computed
+ cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
+ cols_to_learn.each { |col| col.learn(type: type) }
+ EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
+ statistics
+ learned_at
+ sample_values
+ last_datasource_sha
+ is_learning
+ datatype
+ polars_datatype
+ ] })
+ set_feature_lineage
+ reload
+ end
+
+ def set_feature_lineage
+ names = dataset.features.computed_column_names
+ columns = where(name: names, computed_by: nil).map do |col|
+ col.assign_attributes(
+ is_computed: true,
+ computed_by: col.computing_feature&.name,
+ )
+ col
+ end
+ EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
+ end
+
+ def statistics
+ stats = { raw: {}, processed: {} }
+ select(&:persisted?).inject(stats) do |h, col|
+ h.tap do
+ h[:raw][col.name] = col.statistics.dig(:raw)
+ h[:processed][col.name] = col.statistics.dig(:processed)
+ end
+ end.with_indifferent_access
+ end
+
  def one_hots
  column_list.select(&:one_hot?)
  end
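
A hedged sketch of how the new ColumnList#learn and #statistics methods above might be invoked. It assumes ColumnList extends a dataset's columns association (the dataset and column_list helpers it calls suggest this, but the wiring is not shown in this diff), and the dataset name is a placeholder.

# Illustrative only; "My Dataset" is a placeholder and the columns association name is assumed.
dataset = EasyML::Dataset.find_by(name: "My Dataset")

# Learn statistics only for columns flagged by the needs_learn scope, then
# bulk-upsert statistics, learned_at, sample_values, datatype, and related fields.
dataset.columns.learn(type: :raw)

# Per-column statistics aggregated into { raw: {...}, processed: {...} },
# replacing the dataset-level preprocessor_statistics dropped by the new migration.
dataset.columns.statistics[:raw].keys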
@@ -60,94 +117,22 @@ module EasyML

  private

- def set_feature_lineage
- # Get all features that compute columns
- features_computing_columns = dataset.features.all.map do |feature|
- [feature.name, feature.computes_columns]
- end.compact.to_h
-
- updates = column_list.reload.map do |column|
- # Check if column is computed by any feature
- computing_feature = features_computing_columns.find { |_, cols| cols.include?(column.name) }&.first
- is_computed = !computing_feature.nil?
-
- column.assign_attributes(
- computed_by: computing_feature,
- is_computed: is_computed,
- )
- next unless column.changed?
-
- column
- end.compact
- EasyML::Column.import(updates.to_a, { on_duplicate_key_update: { columns: %i[computed_by is_computed] } })
- cols = EasyML::Column.where(id: updates.map(&:id)).to_a
- column_list.bulk_record_history(cols, { history_user_id: 1 })
- end
-
  def import_new(new_columns, existing_columns)
  new_columns = new_columns - existing_columns.map(&:name)
  cols_to_insert = new_columns.map do |col_name|
- EasyML::Column.new(
+ col = EasyML::Column.new(
  name: col_name,
  dataset_id: dataset.id,
  )
+ col
  end
  EasyML::Column.import(cols_to_insert)
+ set_feature_lineage
  column_list.reload
  end

- def update_existing(existing_columns)
- stats = dataset.statistics
- use_processed = dataset.processed.data(limit: 1).present?
- cached_sample = use_processed ? dataset.processed.data(limit: 10, all_columns: true) : dataset.raw.data(limit: 10, all_columns: true)
- existing_types = existing_columns.map(&:name).zip(existing_columns.map(&:datatype)).to_h
- polars_types = cached_sample.columns.zip((cached_sample.dtypes.map do |dtype|
- EasyML::Data::PolarsColumn.polars_to_sym(dtype).to_s
- end)).to_h
-
- existing_columns.each do |column|
- new_polars_type = polars_types[column.name]
- existing_type = existing_types[column.name]
- schema_type = dataset.schema[column.name]
-
- # Keep both datatype and polars_datatype if it's an ordinal encoding case
- if column.ordinal_encoding?
- actual_type = existing_type
- actual_schema_type = existing_type
- else
- actual_type = new_polars_type
- actual_schema_type = schema_type
- end
-
- if column.one_hot?
- base = dataset.raw
- processed = stats.dig("raw", column.name).dup
- processed["null_count"] = 0
- actual_schema_type = "categorical"
- actual_type = "categorical"
- else
- base = use_processed ? dataset.processed : dataset.raw
- processed = stats.dig("processed", column.name)
- end
- sample_values = base.send(:data, unique: true, limit: 5, all_columns: true, select: column.name)[column.name].to_a.uniq[0...5]
-
- column.assign_attributes(
- statistics: {
- raw: stats.dig("raw", column.name),
- processed: processed,
- },
- datatype: actual_schema_type,
- polars_datatype: actual_type,
- sample_values: sample_values,
- )
- end
- EasyML::Column.import(existing_columns.to_a,
- { on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
- sample_values computed_by is_computed] } })
- end
-
  def delete_missing(col_names)
- raw_cols = dataset.best_segment.train(all_columns: true, limit: 1).columns
+ raw_cols = dataset.best_segment.data(all_columns: true, limit: 1).columns
  raw_cols = where(name: raw_cols)
  columns_to_delete = column_list.select do |col|
  col_names.exclude?(col.name) &&