easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
data/app/models/easy_ml/column.rb

@@ -2,23 +2,29 @@
 #
 # Table name: easy_ml_columns
 #
-# id
-# dataset_id
-# name
-# description
-# datatype
-# polars_datatype
-# is_target
-# hidden
-# drop_if_null
-# preprocessing_steps
-# sample_values
-# statistics
-# created_at
-# updated_at
-# is_date_column
-# computed_by
-# is_computed
+# id :bigint not null, primary key
+# dataset_id :bigint not null
+# name :string not null
+# description :string
+# datatype :string
+# polars_datatype :string
+# is_target :boolean default(FALSE)
+# hidden :boolean default(FALSE)
+# drop_if_null :boolean default(FALSE)
+# preprocessing_steps :json
+# sample_values :json
+# statistics :json
+# created_at :datetime not null
+# updated_at :datetime not null
+# is_date_column :boolean default(FALSE)
+# computed_by :string
+# is_computed :boolean default(FALSE)
+# feature_id :bigint
+# learned_at :datetime
+# is_learning :boolean default(FALSE)
+# last_datasource_sha :string
+# last_feature_sha :string
+# configuration_changed_at :datetime
 #
 module EasyML
   class Column < ActiveRecord::Base
@@ -27,6 +33,7 @@ module EasyML
     historiographer_mode :snapshot_only
 
     belongs_to :dataset, class_name: "EasyML::Dataset"
+    belongs_to :feature, class_name: "EasyML::Feature", optional: true
 
     validates :name, presence: true
     validates :name, uniqueness: { scope: :dataset_id }
@@ -34,6 +41,8 @@ module EasyML
     before_save :ensure_valid_datatype
     after_save :handle_date_column_change
     before_save :set_defaults
+    before_save :set_feature_lineage
+    before_save :set_polars_datatype
 
     # Scopes
     scope :visible, -> { where(hidden: false) }
@@ -41,9 +50,71 @@ module EasyML
     scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
     scope :datetime, -> { where(datatype: "datetime") }
     scope :date_column, -> { where(is_date_column: true) }
-    scope :
+    scope :not_preprocessed, -> { where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
+    scope :preprocessed, -> { where("preprocessing_steps IS NOT NULL AND preprocessing_steps::text != '{}'::text") }
+    scope :required, -> { raw.visible.not_target.not_preprocessed }
+    scope :optional, -> { required.not }
+    scope :target, -> { where(is_target: true) }
+    scope :not_target, -> { where(is_target: false) }
     scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
     scope :computed, -> { where(is_computed: true) }
+    scope :raw, -> { where(is_computed: false) }
+    scope :needs_learn, -> {
+      datasource_changed
+        .or(feature_applied)
+        .or(feature_changed)
+        .or(column_changed)
+        .or(never_learned)
+        .or(is_learning)
+    }
+
+    scope :datasource_changed, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(
+          arel_table[:last_datasource_sha].not_eq(
+            Datasource.arel_table[:sha]
+          )
+        )
+    }
+
+    scope :feature_changed, -> {
+      where(feature_id: Feature.has_changes.map(&:id))
+    }
+
+    scope :feature_applied, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(
+          Feature.arel_table[:applied_at].gt(
+            Arel.sql("COALESCE(#{arel_table.name}.learned_at, '1970-01-01')")
+          ).and(
+            arel_table[:feature_id].not_eq(nil)
+          )
+        )
+    }
+
+    scope :column_changed, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(Dataset.arel_table[:refreshed_at].lt(arel_table[:updated_at]))
+    }
+
+    scope :never_learned, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(arel_table[:learned_at].eq(nil))
+        .where(Datasource.arel_table[:sha].not_eq(nil))
+    }
+    scope :is_learning, -> { where(is_learning: true) }
+
+    def display_attributes
+      attributes.except(:statistics)
+    end
+
+    def inspect
+      "#<#{self.class.name} #{display_attributes.map { |k, v| "#{k}: #{v}" }.join(", ")}>"
+    end
 
     def aliases
       [name].concat(virtual_columns)
@@ -57,12 +128,174 @@ module EasyML
       end
     end
 
+    delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
+
+    def empty?
+      data.blank?
+    end
+
+    def learn(type: :all)
+      return if (!in_raw_dataset? && type != :processed)
+
+      if !in_raw_dataset? && read_attribute(:datatype).nil?
+        assign_attributes(datatype: processed.data.to_series.dtype)
+      end
+      set_sample_values
+      assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
+      assign_attributes(
+        learned_at: UTC.now,
+        last_datasource_sha: dataset.last_datasource_sha,
+        last_feature_sha: feature&.sha,
+        is_learning: type == :raw,
+      )
+    end
+
+    def set_configuration_changed_at
+      if preprocessing_steps_changed? || datatype_changed?
+        self.configuration_changed_at = Time.now
+      end
+    end
+
+    def set_sample_values
+      use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
+
+      base = use_processed ? processed : raw
+      sample_values = base.data(limit: 5, unique: true)
+      if sample_values.columns.include?(name)
+        sample_values = sample_values[name].to_a.uniq[0...5]
+        assign_attributes(sample_values: sample_values)
+      end
+    end
+
+    def transform(df, inference: false, computed: false)
+      imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
+
+      df = imputer.transform(df)
+      df
+    end
+
+    def imputers
+      @imputers ||= Column::Imputers.new(self)
+    end
+
+    def decode_labels(df)
+      imputers.training.decode_labels(df)
+    end
+
+    def preprocessed?
+      !preprocessing_steps.blank?
+    end
+
     def datatype=(dtype)
+      if dtype.is_a?(Polars::DataType)
+        dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
+      end
       write_attribute(:datatype, dtype)
-
+      set_polars_datatype
+    end
+
+    def datatype
+      read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
+    end
+
+    def raw_dtype
+      return @raw_dtype if @raw_dtype
+
+      if in_raw_dataset?
+        @raw_dtype = raw&.data&.to_series&.dtype
+      elsif already_computed?
+        @raw_dtype = processed&.data&.to_series&.dtype
+      end
+    end
+
+    def set_polars_datatype
+      raw_type = raw_dtype
+      user_type = get_polars_type(datatype)
+
+      if raw_type == user_type
+        # A raw type of Polars::Datetime might have extra information like timezone, so prefer the raw type
+        write_attribute(:polars_datatype, raw_type.to_s)
+      else
+        # If a user specified type doesn't match the raw type, use the user type
+        write_attribute(:polars_datatype, user_type.to_s)
+      end
+    end
+
+    def polars_datatype
+      begin
+        raw_attr = read_attribute(:polars_datatype)
+        if raw_attr.nil?
+          get_polars_type(datatype)
+        else
+          EasyML::Data::PolarsColumn.parse_polars_dtype(raw_attr)
+        end
+      rescue => e
+        get_polars_type(datatype)
+      end
+    end
+
+    EasyML::Data::PolarsColumn::TYPE_MAP.keys.each do |dtype|
+      define_method("#{dtype}?") do
+        datatype.to_s == dtype.to_s
+      end
+    end
+
+    def datasource_raw
+      dataset.datasource.query(select: name)
+    end
+
+    def already_computed?
+      is_computed && computing_feature&.fit_at.present? || computing_feature&.applied_at.present?
+    end
+
+    def assumed_datatype
+      return @assumed_datatype if @assumed_datatype
+
+      if in_raw_dataset?
+        series = (raw.data || datasource_raw).to_series
+        @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
+      elsif already_computed?
+        return nil if processed.data.nil?
+
+        @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(processed.data.to_series)
+      end
+    end
+
+    def in_raw_dataset?
+      return false if dataset&.raw&.data.nil?
+
+      dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
+    end
+
+    def computing_feature
+      dataset&.features&.detect { |feature| feature.computes_columns.include?(name) }.tap do |computing_feature|
+        if computing_feature.present? && feature_id != computing_feature.id
+          update(feature_id: computing_feature.id)
+        end
+      end
+    end
+
+    alias_method :feature, :computing_feature
+
+    def set_feature_lineage
+      if dataset.features.computed_column_names.include?(name)
+        if computed_by.nil?
+          assign_attributes(
+            is_computed: true,
+            computed_by: computing_feature&.name,
+          )
+        end
+      elsif computed_by.present?
+        assign_attributes(
+          is_computed: false,
+          computed_by: nil,
+        )
+      end
     end
 
     def get_polars_type(dtype)
+      return nil if dtype.nil?
+
       EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
     end
 
@@ -84,7 +317,7 @@ module EasyML
         next config unless config[:params]&.key?(:constant)
 
         config.deep_dup.tap do |c|
-          c[:params][:constant] =
+          c[:params][:constant] = cast(c[:params][:constant])
        end
      end
 
@@ -103,15 +336,47 @@ module EasyML
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
    end
 
+    def encoding
+      return nil unless categorical?
+      return :ordinal if ordinal_encoding?
+      return :one_hot
+    end
+
+    def categorical_min
+      return default_categorical_min unless categorical?
+
+      (preprocessing_steps || {}).deep_symbolize_keys.dig(:training, :params, :categorical_min) || default_categorical_min
+    end
+
+    def default_categorical_min
+      1
+    end
+
+    def statistics
+      (read_attribute(:statistics) || {}).with_indifferent_access
+    end
+
    def allowed_categories
-
-      stats = dataset.statistics
+      stats = statistics
      return [] if stats.nil? || stats.blank?
 
      stats = stats.deep_symbolize_keys
-
+      type = is_computed? ? :processed : :raw
+      stats = stats.dig(type)
 
-
+      # Can we LEARN dtype during LEARN phase... for computed columns to deal with this ish man
+      sorted = (stats.dig(:allowed_categories) || []).sort_by(&method(:sort_by))
+      sorted = sorted.concat(["other"]) if categorical?
+      sorted
+    end
+
+    def sort_by(value)
+      case datatype.to_sym
+      when :boolean
+        value == true ? 1 : 0
+      else
+        value
+      end
    end
 
    def date_column?
@@ -119,19 +384,11 @@ module EasyML
    end
 
    def lineage
-
-        present_in_raw_dataset ? "Raw dataset" : nil,
-        computed_by ? "Computed by #{computed_by}" : nil,
-        preprocessing_steps.present? ? "Preprocessed using #{preprocessing_steps.keys.join(", ")}" : nil,
-      ].compact
+      @lineage ||= EasyML::Column::Lineage.new(self).lineage
    end
 
    def required?
-      is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
-    end
-
-    def present_in_raw_dataset
-      dataset.raw.data&.columns&.include?(name) || false
+      !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
    end
 
    def sort_required
@@ -148,6 +405,28 @@ module EasyML
      }.compact
    end
 
+    def cast(value)
+      return value if value.nil?
+
+      case datatype&.to_sym
+      when :float
+        Float(value)
+      when :integer
+        Integer(value)
+      when :boolean
+        ActiveModel::Type::Boolean.new.cast(value)
+      when :datetime
+        value.is_a?(String) ? Time.parse(value) : value
+      when :categorical
+        value
+      else
+        value.to_s
+      end
+    rescue ArgumentError, TypeError
+      # If conversion fails, return original value
+      value
+    end
+
    private
 
    def set_defaults
@@ -247,26 +526,14 @@ module EasyML
        throw :abort
      end
 
-
-      return value if value.nil?
+    NUMERIC_METHODS = %i[mean median].freeze
 
-
-
-        Float(value)
-      when :integer
-        Integer(value)
-      when :boolean
-        ActiveModel::Type::Boolean.new.cast(value)
-      when :datetime
-        value.is_a?(String) ? Time.parse(value) : value
-      else
-        value.to_s
-      end
-    rescue ArgumentError, TypeError
-      # If conversion fails, return original value
-      value
+    def data_selector
+      @data_selector ||= Column::Selector.new(self)
    end
 
-
+    def learner
+      @learner ||= Column::Learner.new(self)
+    end
  end
 end
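The column.rb hunks above move per-column preprocessing onto the new Column::Imputers, Column::Learner, Column::Selector, and Column::Lineage objects. A minimal usage sketch of the new per-column API, assuming an already-synced EasyML::Dataset that exposes its columns through a `columns` association; the `dataset` and `df` variables and the "state" column name are hypothetical, not part of this diff:

# Sketch only: `dataset` is an assumed EasyML::Dataset, `df` an assumed Polars dataframe.
column = dataset.columns.find_by(name: "state") # "state" is a made-up column name

# Re-learn statistics for columns flagged by the new needs_learn scope
dataset.columns.needs_learn.each { |col| col.learn(type: :raw) }

# Apply the column's configured imputers to the dataframe
df = column.transform(df, inference: false)

# Map encoded categorical labels back to their original values
decoded = column.decode_labels(df)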
data/app/models/easy_ml/column_history.rb

@@ -2,28 +2,34 @@
 #
 # Table name: easy_ml_column_histories
 #
-# id
-# column_id
-# dataset_id
-# name
-# description
-# datatype
-# polars_datatype
-# is_target
-# hidden
-# drop_if_null
-# preprocessing_steps
-# sample_values
-# statistics
-# created_at
-# updated_at
-# history_started_at
-# history_ended_at
-# history_user_id
-# snapshot_id
-# is_date_column
-# computed_by
-# is_computed
+# id :bigint not null, primary key
+# column_id :integer not null
+# dataset_id :integer not null
+# name :string not null
+# description :string
+# datatype :string
+# polars_datatype :string
+# is_target :boolean default(FALSE)
+# hidden :boolean default(FALSE)
+# drop_if_null :boolean default(FALSE)
+# preprocessing_steps :json
+# sample_values :json
+# statistics :json
+# created_at :datetime not null
+# updated_at :datetime not null
+# history_started_at :datetime not null
+# history_ended_at :datetime
+# history_user_id :integer
+# snapshot_id :string
+# is_date_column :boolean default(FALSE)
+# computed_by :string
+# is_computed :boolean default(FALSE)
+# feature_id :bigint
+# learned_at :datetime
+# is_learning :boolean default(FALSE)
+# last_datasource_sha :string
+# last_feature_sha :string
+# configuration_changed_at :datetime
 #
 module EasyML
   class ColumnHistory < ActiveRecord::Base
@@ -31,5 +37,6 @@ module EasyML
     include Historiographer::History
     scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
     scope :computed, -> { where(is_computed: true) }
+    scope :raw, -> { where(is_computed: false) }
   end
 end
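The new learned_at and last_datasource_sha columns annotated above are what the needs_learn sub-scopes compare against. A hedged query sketch using only scopes that appear in this diff (variable names are illustrative):

# Columns the needs_learn scope considers stale, e.g. after a datasource refresh.
stale = EasyML::Column.needs_learn
stale.pluck(:name, :learned_at, :last_datasource_sha)

# History snapshots can be partitioned with the newly added scope as well.
EasyML::ColumnHistory.raw.count      # snapshots of non-computed columns
EasyML::ColumnHistory.computed.count # snapshots of feature-computed columns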
data/app/models/easy_ml/column_list.rb

@@ -9,8 +9,7 @@ module EasyML
      col_names = syncable
      existing_columns = where(name: col_names)
      import_new(col_names, existing_columns)
-      update_existing(existing_columns)
-      set_feature_lineage
+      # update_existing(existing_columns)
 
      if delete
        delete_missing(col_names)
@@ -22,6 +21,64 @@ module EasyML
      end
    end
 
+    def transform(df, inference: false, computed: false)
+      return df if df.nil?
+
+      if computed
+        cols = column_list.computed
+      else
+        cols = column_list.raw
+      end
+
+      by_name = cols.index_by(&:name)
+      df.columns.each do |col|
+        column = by_name[col]
+        df = column.transform(df, inference: inference, computed: computed) if column
+      end
+
+      df
+    end
+
+    def learn(type: :raw, computed: false)
+      cols_to_learn = column_list.reload.needs_learn
+      cols_to_learn = cols_to_learn.computed if computed
+      cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
+      cols_to_learn.each { |col| col.learn(type: type) }
+      EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
+        statistics
+        learned_at
+        sample_values
+        last_datasource_sha
+        is_learning
+        datatype
+        polars_datatype
+      ] })
+      set_feature_lineage
+      reload
+    end
+
+    def set_feature_lineage
+      names = dataset.features.computed_column_names
+      columns = where(name: names, computed_by: nil).map do |col|
+        col.assign_attributes(
+          is_computed: true,
+          computed_by: col.computing_feature&.name,
+        )
+        col
+      end
+      EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
+    end
+
+    def statistics
+      stats = { raw: {}, processed: {} }
+      select(&:persisted?).inject(stats) do |h, col|
+        h.tap do
+          h[:raw][col.name] = col.statistics.dig(:raw)
+          h[:processed][col.name] = col.statistics.dig(:processed)
+        end
+      end.with_indifferent_access
+    end
+
    def one_hots
      column_list.select(&:one_hot?)
    end
@@ -60,94 +117,22 @@ module EasyML
 
    private
 
-    def set_feature_lineage
-      # Get all features that compute columns
-      features_computing_columns = dataset.features.all.map do |feature|
-        [feature.name, feature.computes_columns]
-      end.compact.to_h
-
-      updates = column_list.reload.map do |column|
-        # Check if column is computed by any feature
-        computing_feature = features_computing_columns.find { |_, cols| cols.include?(column.name) }&.first
-        is_computed = !computing_feature.nil?
-
-        column.assign_attributes(
-          computed_by: computing_feature,
-          is_computed: is_computed,
-        )
-        next unless column.changed?
-
-        column
-      end.compact
-      EasyML::Column.import(updates.to_a, { on_duplicate_key_update: { columns: %i[computed_by is_computed] } })
-      cols = EasyML::Column.where(id: updates.map(&:id)).to_a
-      column_list.bulk_record_history(cols, { history_user_id: 1 })
-    end
-
    def import_new(new_columns, existing_columns)
      new_columns = new_columns - existing_columns.map(&:name)
      cols_to_insert = new_columns.map do |col_name|
-        EasyML::Column.new(
+        col = EasyML::Column.new(
          name: col_name,
          dataset_id: dataset.id,
        )
+        col
      end
      EasyML::Column.import(cols_to_insert)
+      set_feature_lineage
      column_list.reload
    end
 
-    def update_existing(existing_columns)
-      stats = dataset.statistics
-      use_processed = dataset.processed.data(limit: 1).present?
-      cached_sample = use_processed ? dataset.processed.data(limit: 10, all_columns: true) : dataset.raw.data(limit: 10, all_columns: true)
-      existing_types = existing_columns.map(&:name).zip(existing_columns.map(&:datatype)).to_h
-      polars_types = cached_sample.columns.zip((cached_sample.dtypes.map do |dtype|
-        EasyML::Data::PolarsColumn.polars_to_sym(dtype).to_s
-      end)).to_h
-
-      existing_columns.each do |column|
-        new_polars_type = polars_types[column.name]
-        existing_type = existing_types[column.name]
-        schema_type = dataset.schema[column.name]
-
-        # Keep both datatype and polars_datatype if it's an ordinal encoding case
-        if column.ordinal_encoding?
-          actual_type = existing_type
-          actual_schema_type = existing_type
-        else
-          actual_type = new_polars_type
-          actual_schema_type = schema_type
-        end
-
-        if column.one_hot?
-          base = dataset.raw
-          processed = stats.dig("raw", column.name).dup
-          processed["null_count"] = 0
-          actual_schema_type = "categorical"
-          actual_type = "categorical"
-        else
-          base = use_processed ? dataset.processed : dataset.raw
-          processed = stats.dig("processed", column.name)
-        end
-        sample_values = base.send(:data, unique: true, limit: 5, all_columns: true, select: column.name)[column.name].to_a.uniq[0...5]
-
-        column.assign_attributes(
-          statistics: {
-            raw: stats.dig("raw", column.name),
-            processed: processed,
-          },
-          datatype: actual_schema_type,
-          polars_datatype: actual_type,
-          sample_values: sample_values,
-        )
-      end
-      EasyML::Column.import(existing_columns.to_a,
-                            { on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
-                                                                     sample_values computed_by is_computed] } })
-    end
-
    def delete_missing(col_names)
-      raw_cols = dataset.best_segment.
+      raw_cols = dataset.best_segment.data(all_columns: true, limit: 1).columns
      raw_cols = where(name: raw_cols)
      columns_to_delete = column_list.select do |col|
        col_names.exclude?(col.name) &&
|