easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +344 -39
- data/app/models/easy_ml/column_history.rb +31 -20
- data/app/models/easy_ml/column_list.rb +79 -62
- data/app/models/easy_ml/dataset.rb +156 -104
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +4 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +29 -10
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/models/easy_ml/model.rb +25 -4
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +5 -3
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +45 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -383
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -128
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -2,21 +2,29 @@
|
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_columns
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# dataset_id
|
7
|
-
# name
|
8
|
-
# description
|
9
|
-
# datatype
|
10
|
-
# polars_datatype
|
11
|
-
# is_target
|
12
|
-
# hidden
|
13
|
-
# drop_if_null
|
14
|
-
# preprocessing_steps
|
15
|
-
# sample_values
|
16
|
-
# statistics
|
17
|
-
# created_at
|
18
|
-
# updated_at
|
19
|
-
# is_date_column
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :bigint not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# datatype :string
|
10
|
+
# polars_datatype :string
|
11
|
+
# is_target :boolean default(FALSE)
|
12
|
+
# hidden :boolean default(FALSE)
|
13
|
+
# drop_if_null :boolean default(FALSE)
|
14
|
+
# preprocessing_steps :json
|
15
|
+
# sample_values :json
|
16
|
+
# statistics :json
|
17
|
+
# created_at :datetime not null
|
18
|
+
# updated_at :datetime not null
|
19
|
+
# is_date_column :boolean default(FALSE)
|
20
|
+
# computed_by :string
|
21
|
+
# is_computed :boolean default(FALSE)
|
22
|
+
# feature_id :bigint
|
23
|
+
# learned_at :datetime
|
24
|
+
# is_learning :boolean default(FALSE)
|
25
|
+
# last_datasource_sha :string
|
26
|
+
# last_feature_sha :string
|
27
|
+
# configuration_changed_at :datetime
|
20
28
|
#
|
21
29
|
module EasyML
|
22
30
|
class Column < ActiveRecord::Base
|
@@ -25,6 +33,7 @@ module EasyML
|
|
25
33
|
historiographer_mode :snapshot_only
|
26
34
|
|
27
35
|
belongs_to :dataset, class_name: "EasyML::Dataset"
|
36
|
+
belongs_to :feature, class_name: "EasyML::Feature", optional: true
|
28
37
|
|
29
38
|
validates :name, presence: true
|
30
39
|
validates :name, uniqueness: { scope: :dataset_id }
|
@@ -32,6 +41,8 @@ module EasyML
|
|
32
41
|
before_save :ensure_valid_datatype
|
33
42
|
after_save :handle_date_column_change
|
34
43
|
before_save :set_defaults
|
44
|
+
before_save :set_feature_lineage
|
45
|
+
before_save :set_polars_datatype
|
35
46
|
|
36
47
|
# Scopes
|
37
48
|
scope :visible, -> { where(hidden: false) }
|
@@ -39,8 +50,73 @@ module EasyML
|
|
39
50
|
scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
|
40
51
|
scope :datetime, -> { where(datatype: "datetime") }
|
41
52
|
scope :date_column, -> { where(is_date_column: true) }
|
53
|
+
scope :not_preprocessed, -> { where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
|
54
|
+
scope :preprocessed, -> { where("preprocessing_steps IS NOT NULL AND preprocessing_steps::text != '{}'::text") }
|
55
|
+
scope :required, -> { raw.visible.not_target.not_preprocessed }
|
56
|
+
scope :optional, -> { required.not }
|
57
|
+
scope :target, -> { where(is_target: true) }
|
58
|
+
scope :not_target, -> { where(is_target: false) }
|
59
|
+
scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
|
60
|
+
scope :computed, -> { where(is_computed: true) }
|
61
|
+
scope :raw, -> { where(is_computed: false) }
|
62
|
+
scope :needs_learn, -> {
|
63
|
+
datasource_changed
|
64
|
+
.or(feature_applied)
|
65
|
+
.or(feature_changed)
|
66
|
+
.or(column_changed)
|
67
|
+
.or(never_learned)
|
68
|
+
.or(is_learning)
|
69
|
+
}
|
70
|
+
|
71
|
+
scope :datasource_changed, -> {
|
72
|
+
left_joins(dataset: :datasource)
|
73
|
+
.left_joins(:feature)
|
74
|
+
.where(
|
75
|
+
arel_table[:last_datasource_sha].not_eq(
|
76
|
+
Datasource.arel_table[:sha]
|
77
|
+
)
|
78
|
+
)
|
79
|
+
}
|
80
|
+
|
81
|
+
scope :feature_changed, -> {
|
82
|
+
where(feature_id: Feature.has_changes.map(&:id))
|
83
|
+
}
|
84
|
+
|
85
|
+
scope :feature_applied, -> {
|
86
|
+
left_joins(dataset: :datasource)
|
87
|
+
.left_joins(:feature)
|
88
|
+
.where(
|
89
|
+
Feature.arel_table[:applied_at].gt(
|
90
|
+
Arel.sql("COALESCE(#{arel_table.name}.learned_at, '1970-01-01')")
|
91
|
+
).and(
|
92
|
+
arel_table[:feature_id].not_eq(nil)
|
93
|
+
)
|
94
|
+
)
|
95
|
+
}
|
96
|
+
|
97
|
+
scope :column_changed, -> {
|
98
|
+
left_joins(dataset: :datasource)
|
99
|
+
.left_joins(:feature)
|
100
|
+
.where(Dataset.arel_table[:refreshed_at].lt(arel_table[:updated_at]))
|
101
|
+
}
|
102
|
+
|
103
|
+
scope :never_learned, -> {
|
104
|
+
left_joins(dataset: :datasource)
|
105
|
+
.left_joins(:feature)
|
106
|
+
.where(arel_table[:learned_at].eq(nil))
|
107
|
+
.where(Datasource.arel_table[:sha].not_eq(nil))
|
108
|
+
}
|
109
|
+
scope :is_learning, -> { where(is_learning: true) }
|
110
|
+
|
111
|
+
def display_attributes
|
112
|
+
attributes.except(:statistics)
|
113
|
+
end
|
114
|
+
|
115
|
+
def inspect
|
116
|
+
"#<#{self.class.name} #{display_attributes.map { |k, v| "#{k}: #{v}" }.join(", ")}>"
|
117
|
+
end
|
42
118
|
|
43
|
-
def
|
119
|
+
def aliases
|
44
120
|
[name].concat(virtual_columns)
|
45
121
|
end
|
46
122
|
|
@@ -52,12 +128,174 @@ module EasyML
|
|
52
128
|
end
|
53
129
|
end
|
54
130
|
|
131
|
+
delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
|
132
|
+
|
133
|
+
def empty?
|
134
|
+
data.blank?
|
135
|
+
end
|
136
|
+
|
137
|
+
def learn(type: :all)
|
138
|
+
return if (!in_raw_dataset? && type != :processed)
|
139
|
+
|
140
|
+
if !in_raw_dataset? && read_attribute(:datatype).nil?
|
141
|
+
assign_attributes(datatype: processed.data.to_series.dtype)
|
142
|
+
end
|
143
|
+
set_sample_values
|
144
|
+
assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
|
145
|
+
assign_attributes(
|
146
|
+
learned_at: UTC.now,
|
147
|
+
last_datasource_sha: dataset.last_datasource_sha,
|
148
|
+
last_feature_sha: feature&.sha,
|
149
|
+
is_learning: type == :raw,
|
150
|
+
)
|
151
|
+
end
|
152
|
+
|
153
|
+
def set_configuration_changed_at
|
154
|
+
if preprocessing_steps_changed? || datatype_changed?
|
155
|
+
self.configuration_changed_at = Time.now
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
def set_sample_values
|
160
|
+
use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
|
161
|
+
|
162
|
+
base = use_processed ? processed : raw
|
163
|
+
sample_values = base.data(limit: 5, unique: true)
|
164
|
+
if sample_values.columns.include?(name)
|
165
|
+
sample_values = sample_values[name].to_a.uniq[0...5]
|
166
|
+
assign_attributes(sample_values: sample_values)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
def transform(df, inference: false, computed: false)
|
171
|
+
imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
|
172
|
+
|
173
|
+
df = imputer.transform(df)
|
174
|
+
df
|
175
|
+
end
|
176
|
+
|
177
|
+
def imputers
|
178
|
+
@imputers ||= Column::Imputers.new(self)
|
179
|
+
end
|
180
|
+
|
181
|
+
def decode_labels(df)
|
182
|
+
imputers.training.decode_labels(df)
|
183
|
+
end
|
184
|
+
|
185
|
+
def preprocessed?
|
186
|
+
!preprocessing_steps.blank?
|
187
|
+
end
|
188
|
+
|
55
189
|
def datatype=(dtype)
|
190
|
+
if dtype.is_a?(Polars::DataType)
|
191
|
+
dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
|
192
|
+
end
|
56
193
|
write_attribute(:datatype, dtype)
|
57
|
-
|
194
|
+
set_polars_datatype
|
195
|
+
end
|
196
|
+
|
197
|
+
def datatype
|
198
|
+
read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
|
199
|
+
end
|
200
|
+
|
201
|
+
def raw_dtype
|
202
|
+
return @raw_dtype if @raw_dtype
|
203
|
+
|
204
|
+
if in_raw_dataset?
|
205
|
+
@raw_dtype = raw&.data&.to_series&.dtype
|
206
|
+
elsif already_computed?
|
207
|
+
@raw_dtype = processed&.data&.to_series&.dtype
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
def set_polars_datatype
|
212
|
+
raw_type = raw_dtype
|
213
|
+
user_type = get_polars_type(datatype)
|
214
|
+
|
215
|
+
if raw_type == user_type
|
216
|
+
# A raw type of Polars::Datetime might have extra information like timezone, so prefer the raw type
|
217
|
+
write_attribute(:polars_datatype, raw_type.to_s)
|
218
|
+
else
|
219
|
+
# If a user-specified type doesn't match the raw type, use the user-specified type
|
220
|
+
write_attribute(:polars_datatype, user_type.to_s)
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
def polars_datatype
|
225
|
+
begin
|
226
|
+
raw_attr = read_attribute(:polars_datatype)
|
227
|
+
if raw_attr.nil?
|
228
|
+
get_polars_type(datatype)
|
229
|
+
else
|
230
|
+
EasyML::Data::PolarsColumn.parse_polars_dtype(raw_attr)
|
231
|
+
end
|
232
|
+
rescue => e
|
233
|
+
get_polars_type(datatype)
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
EasyML::Data::PolarsColumn::TYPE_MAP.keys.each do |dtype|
|
238
|
+
define_method("#{dtype}?") do
|
239
|
+
datatype.to_s == dtype.to_s
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
def datasource_raw
|
244
|
+
dataset.datasource.query(select: name)
|
245
|
+
end
|
246
|
+
|
247
|
+
def already_computed?
|
248
|
+
is_computed && computing_feature&.fit_at.present? || computing_feature&.applied_at.present?
|
249
|
+
end
|
250
|
+
|
251
|
+
def assumed_datatype
|
252
|
+
return @assumed_datatype if @assumed_datatype
|
253
|
+
|
254
|
+
if in_raw_dataset?
|
255
|
+
series = (raw.data || datasource_raw).to_series
|
256
|
+
@assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
|
257
|
+
elsif already_computed?
|
258
|
+
return nil if processed.data.nil?
|
259
|
+
|
260
|
+
@assumed_datatype = EasyML::Data::PolarsColumn.determine_type(processed.data.to_series)
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
def in_raw_dataset?
|
265
|
+
return false if dataset&.raw&.data.nil?
|
266
|
+
|
267
|
+
dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
|
268
|
+
end
|
269
|
+
|
270
|
+
def computing_feature
|
271
|
+
dataset&.features&.detect { |feature| feature.computes_columns.include?(name) }.tap do |computing_feature|
|
272
|
+
if computing_feature.present? && feature_id != computing_feature.id
|
273
|
+
update(feature_id: computing_feature.id)
|
274
|
+
end
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
278
|
+
alias_method :feature, :computing_feature
|
279
|
+
|
280
|
+
def set_feature_lineage
|
281
|
+
if dataset.features.computed_column_names.include?(name)
|
282
|
+
if computed_by.nil?
|
283
|
+
assign_attributes(
|
284
|
+
is_computed: true,
|
285
|
+
computed_by: computing_feature&.name,
|
286
|
+
)
|
287
|
+
end
|
288
|
+
elsif computed_by.present?
|
289
|
+
assign_attributes(
|
290
|
+
is_computed: false,
|
291
|
+
computed_by: nil,
|
292
|
+
)
|
293
|
+
end
|
58
294
|
end
|
59
295
|
|
60
296
|
def get_polars_type(dtype)
|
297
|
+
return nil if dtype.nil?
|
298
|
+
|
61
299
|
EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
|
62
300
|
end
|
63
301
|
|
@@ -79,7 +317,7 @@ module EasyML
|
|
79
317
|
next config unless config[:params]&.key?(:constant)
|
80
318
|
|
81
319
|
config.deep_dup.tap do |c|
|
82
|
-
c[:params][:constant] =
|
320
|
+
c[:params][:constant] = cast(c[:params][:constant])
|
83
321
|
end
|
84
322
|
end
|
85
323
|
|
@@ -98,18 +336,97 @@ module EasyML
|
|
98
336
|
preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
|
99
337
|
end
|
100
338
|
|
339
|
+
def encoding
|
340
|
+
return nil unless categorical?
|
341
|
+
return :ordinal if ordinal_encoding?
|
342
|
+
return :one_hot
|
343
|
+
end
|
344
|
+
|
345
|
+
def categorical_min
|
346
|
+
return default_categorical_min unless categorical?
|
347
|
+
|
348
|
+
(preprocessing_steps || {}).deep_symbolize_keys.dig(:training, :params, :categorical_min) || default_categorical_min
|
349
|
+
end
|
350
|
+
|
351
|
+
def default_categorical_min
|
352
|
+
1
|
353
|
+
end
|
354
|
+
|
355
|
+
def statistics
|
356
|
+
(read_attribute(:statistics) || {}).with_indifferent_access
|
357
|
+
end
|
358
|
+
|
101
359
|
def allowed_categories
|
102
|
-
|
103
|
-
stats = dataset.preprocessor.statistics
|
360
|
+
stats = statistics
|
104
361
|
return [] if stats.nil? || stats.blank?
|
105
362
|
|
106
|
-
stats
|
363
|
+
stats = stats.deep_symbolize_keys
|
364
|
+
type = is_computed? ? :processed : :raw
|
365
|
+
stats = stats.dig(type)
|
366
|
+
|
367
|
+
# TODO: Can we learn the dtype of computed columns during the learn phase to handle this case?
|
368
|
+
sorted = (stats.dig(:allowed_categories) || []).sort_by(&method(:sort_by))
|
369
|
+
sorted = sorted.concat(["other"]) if categorical?
|
370
|
+
sorted
|
371
|
+
end
|
372
|
+
|
373
|
+
def sort_by(value)
|
374
|
+
case datatype.to_sym
|
375
|
+
when :boolean
|
376
|
+
value == true ? 1 : 0
|
377
|
+
else
|
378
|
+
value
|
379
|
+
end
|
107
380
|
end
|
108
381
|
|
109
382
|
def date_column?
|
110
383
|
is_date_column
|
111
384
|
end
|
112
385
|
|
386
|
+
def lineage
|
387
|
+
@lineage ||= EasyML::Column::Lineage.new(self).lineage
|
388
|
+
end
|
389
|
+
|
390
|
+
def required?
|
391
|
+
!is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
|
392
|
+
end
|
393
|
+
|
394
|
+
def sort_required
|
395
|
+
required? ? 0 : 1
|
396
|
+
end
|
397
|
+
|
398
|
+
def to_api
|
399
|
+
{
|
400
|
+
name: name,
|
401
|
+
datatype: datatype,
|
402
|
+
description: description,
|
403
|
+
required: required?,
|
404
|
+
allowed_values: allowed_categories.empty? ? nil : allowed_categories,
|
405
|
+
}.compact
|
406
|
+
end
|
407
|
+
|
408
|
+
def cast(value)
|
409
|
+
return value if value.nil?
|
410
|
+
|
411
|
+
case datatype&.to_sym
|
412
|
+
when :float
|
413
|
+
Float(value)
|
414
|
+
when :integer
|
415
|
+
Integer(value)
|
416
|
+
when :boolean
|
417
|
+
ActiveModel::Type::Boolean.new.cast(value)
|
418
|
+
when :datetime
|
419
|
+
value.is_a?(String) ? Time.parse(value) : value
|
420
|
+
when :categorical
|
421
|
+
value
|
422
|
+
else
|
423
|
+
value.to_s
|
424
|
+
end
|
425
|
+
rescue ArgumentError, TypeError
|
426
|
+
# If conversion fails, return original value
|
427
|
+
value
|
428
|
+
end
|
429
|
+
|
113
430
|
private
|
114
431
|
|
115
432
|
def set_defaults
|
@@ -209,26 +526,14 @@ module EasyML
|
|
209
526
|
throw :abort
|
210
527
|
end
|
211
528
|
|
212
|
-
|
213
|
-
return value if value.nil?
|
529
|
+
NUMERIC_METHODS = %i[mean median].freeze
|
214
530
|
|
215
|
-
|
216
|
-
|
217
|
-
Float(value)
|
218
|
-
when :integer
|
219
|
-
Integer(value)
|
220
|
-
when :boolean
|
221
|
-
ActiveModel::Type::Boolean.new.cast(value)
|
222
|
-
when :datetime
|
223
|
-
value.is_a?(String) ? Time.parse(value) : value
|
224
|
-
else
|
225
|
-
value.to_s
|
226
|
-
end
|
227
|
-
rescue ArgumentError, TypeError
|
228
|
-
# If conversion fails, return original value
|
229
|
-
value
|
531
|
+
def data_selector
|
532
|
+
@data_selector ||= Column::Selector.new(self)
|
230
533
|
end
|
231
534
|
|
232
|
-
|
535
|
+
def learner
|
536
|
+
@learner ||= Column::Learner.new(self)
|
537
|
+
end
|
233
538
|
end
|
234
539
|
end
|
@@ -2,30 +2,41 @@
|
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_column_histories
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# column_id
|
7
|
-
# dataset_id
|
8
|
-
# name
|
9
|
-
# description
|
10
|
-
# datatype
|
11
|
-
# polars_datatype
|
12
|
-
# is_target
|
13
|
-
# hidden
|
14
|
-
# drop_if_null
|
15
|
-
# preprocessing_steps
|
16
|
-
# sample_values
|
17
|
-
# statistics
|
18
|
-
# created_at
|
19
|
-
# updated_at
|
20
|
-
# history_started_at
|
21
|
-
# history_ended_at
|
22
|
-
# history_user_id
|
23
|
-
# snapshot_id
|
24
|
-
# is_date_column
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# column_id :integer not null
|
7
|
+
# dataset_id :integer not null
|
8
|
+
# name :string not null
|
9
|
+
# description :string
|
10
|
+
# datatype :string
|
11
|
+
# polars_datatype :string
|
12
|
+
# is_target :boolean default(FALSE)
|
13
|
+
# hidden :boolean default(FALSE)
|
14
|
+
# drop_if_null :boolean default(FALSE)
|
15
|
+
# preprocessing_steps :json
|
16
|
+
# sample_values :json
|
17
|
+
# statistics :json
|
18
|
+
# created_at :datetime not null
|
19
|
+
# updated_at :datetime not null
|
20
|
+
# history_started_at :datetime not null
|
21
|
+
# history_ended_at :datetime
|
22
|
+
# history_user_id :integer
|
23
|
+
# snapshot_id :string
|
24
|
+
# is_date_column :boolean default(FALSE)
|
25
|
+
# computed_by :string
|
26
|
+
# is_computed :boolean default(FALSE)
|
27
|
+
# feature_id :bigint
|
28
|
+
# learned_at :datetime
|
29
|
+
# is_learning :boolean default(FALSE)
|
30
|
+
# last_datasource_sha :string
|
31
|
+
# last_feature_sha :string
|
32
|
+
# configuration_changed_at :datetime
|
25
33
|
#
|
26
34
|
module EasyML
|
27
35
|
class ColumnHistory < ActiveRecord::Base
|
28
36
|
self.table_name = "easy_ml_column_histories"
|
29
37
|
include Historiographer::History
|
38
|
+
scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
|
39
|
+
scope :computed, -> { where(is_computed: true) }
|
40
|
+
scope :raw, -> { where(is_computed: false) }
|
30
41
|
end
|
31
42
|
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module EasyML
|
2
2
|
module ColumnList
|
3
|
+
include Historiographer::Relation
|
4
|
+
|
3
5
|
def sync(delete: true)
|
4
6
|
return unless dataset.schema.present?
|
5
7
|
|
@@ -7,10 +9,10 @@ module EasyML
|
|
7
9
|
col_names = syncable
|
8
10
|
existing_columns = where(name: col_names)
|
9
11
|
import_new(col_names, existing_columns)
|
10
|
-
update_existing(existing_columns)
|
12
|
+
# update_existing(existing_columns)
|
11
13
|
|
12
14
|
if delete
|
13
|
-
delete_missing(
|
15
|
+
delete_missing(col_names)
|
14
16
|
end
|
15
17
|
|
16
18
|
if existing_columns.none? # Totally new dataset
|
@@ -19,6 +21,64 @@ module EasyML
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
24
|
+
def transform(df, inference: false, computed: false)
|
25
|
+
return df if df.nil?
|
26
|
+
|
27
|
+
if computed
|
28
|
+
cols = column_list.computed
|
29
|
+
else
|
30
|
+
cols = column_list.raw
|
31
|
+
end
|
32
|
+
|
33
|
+
by_name = cols.index_by(&:name)
|
34
|
+
df.columns.each do |col|
|
35
|
+
column = by_name[col]
|
36
|
+
df = column.transform(df, inference: inference, computed: computed) if column
|
37
|
+
end
|
38
|
+
|
39
|
+
df
|
40
|
+
end
|
41
|
+
|
42
|
+
def learn(type: :raw, computed: false)
|
43
|
+
cols_to_learn = column_list.reload.needs_learn
|
44
|
+
cols_to_learn = cols_to_learn.computed if computed
|
45
|
+
cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
|
46
|
+
cols_to_learn.each { |col| col.learn(type: type) }
|
47
|
+
EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
|
48
|
+
statistics
|
49
|
+
learned_at
|
50
|
+
sample_values
|
51
|
+
last_datasource_sha
|
52
|
+
is_learning
|
53
|
+
datatype
|
54
|
+
polars_datatype
|
55
|
+
] })
|
56
|
+
set_feature_lineage
|
57
|
+
reload
|
58
|
+
end
|
59
|
+
|
60
|
+
def set_feature_lineage
|
61
|
+
names = dataset.features.computed_column_names
|
62
|
+
columns = where(name: names, computed_by: nil).map do |col|
|
63
|
+
col.assign_attributes(
|
64
|
+
is_computed: true,
|
65
|
+
computed_by: col.computing_feature&.name,
|
66
|
+
)
|
67
|
+
col
|
68
|
+
end
|
69
|
+
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
|
70
|
+
end
|
71
|
+
|
72
|
+
def statistics
|
73
|
+
stats = { raw: {}, processed: {} }
|
74
|
+
select(&:persisted?).inject(stats) do |h, col|
|
75
|
+
h.tap do
|
76
|
+
h[:raw][col.name] = col.statistics.dig(:raw)
|
77
|
+
h[:processed][col.name] = col.statistics.dig(:processed)
|
78
|
+
end
|
79
|
+
end.with_indifferent_access
|
80
|
+
end
|
81
|
+
|
22
82
|
def one_hots
|
23
83
|
column_list.select(&:one_hot?)
|
24
84
|
end
|
@@ -37,14 +97,9 @@ module EasyML
|
|
37
97
|
end
|
38
98
|
end
|
39
99
|
|
40
|
-
def virtual_column?(column)
|
41
|
-
false
|
42
|
-
end
|
43
|
-
|
44
100
|
def syncable
|
45
101
|
dataset.processed_schema.keys.select do |col|
|
46
|
-
!one_hot?(col)
|
47
|
-
!virtual_column?(col)
|
102
|
+
!one_hot?(col)
|
48
103
|
end
|
49
104
|
end
|
50
105
|
|
@@ -56,73 +111,35 @@ module EasyML
|
|
56
111
|
proxy_association.owner
|
57
112
|
end
|
58
113
|
|
114
|
+
def sort_by_required
|
115
|
+
column_list.sort_by { |col| [col.sort_required, col.name] }
|
116
|
+
end
|
117
|
+
|
59
118
|
private
|
60
119
|
|
61
120
|
def import_new(new_columns, existing_columns)
|
62
121
|
new_columns = new_columns - existing_columns.map(&:name)
|
63
122
|
cols_to_insert = new_columns.map do |col_name|
|
64
|
-
EasyML::Column.new(
|
123
|
+
col = EasyML::Column.new(
|
65
124
|
name: col_name,
|
66
125
|
dataset_id: dataset.id,
|
67
126
|
)
|
127
|
+
col
|
68
128
|
end
|
69
129
|
EasyML::Column.import(cols_to_insert)
|
130
|
+
set_feature_lineage
|
131
|
+
column_list.reload
|
70
132
|
end
|
71
133
|
|
72
|
-
def
|
73
|
-
|
74
|
-
use_processed = dataset.processed.data(limit: 1).present?
|
75
|
-
cached_sample = use_processed ? dataset.processed.data(limit: 10, all_columns: true) : dataset.raw.data(limit: 10, all_columns: true)
|
76
|
-
existing_types = existing_columns.map(&:name).zip(existing_columns.map(&:datatype)).to_h
|
77
|
-
polars_types = cached_sample.columns.zip((cached_sample.dtypes.map do |dtype|
|
78
|
-
EasyML::Data::PolarsColumn.polars_to_sym(dtype).to_s
|
79
|
-
end)).to_h
|
80
|
-
|
81
|
-
existing_columns.each do |column|
|
82
|
-
new_polars_type = polars_types[column.name]
|
83
|
-
existing_type = existing_types[column.name]
|
84
|
-
schema_type = dataset.schema[column.name]
|
85
|
-
|
86
|
-
# Keep both datatype and polars_datatype if it's an ordinal encoding case
|
87
|
-
if column.ordinal_encoding?
|
88
|
-
actual_type = existing_type
|
89
|
-
actual_schema_type = existing_type
|
90
|
-
else
|
91
|
-
actual_type = new_polars_type
|
92
|
-
actual_schema_type = schema_type
|
93
|
-
end
|
94
|
-
|
95
|
-
if column.one_hot?
|
96
|
-
base = dataset.raw
|
97
|
-
processed = stats.dig("raw", column.name).dup
|
98
|
-
processed["null_count"] = 0
|
99
|
-
actual_schema_type = "categorical"
|
100
|
-
actual_type = "categorical"
|
101
|
-
else
|
102
|
-
base = use_processed ? dataset.processed : dataset.raw
|
103
|
-
processed = stats.dig("processed", column.name)
|
104
|
-
end
|
105
|
-
sample_values = base.send(:data, unique: true, limit: 5, all_columns: true, select: column.name)[column.name].to_a.uniq[0...5]
|
106
|
-
|
107
|
-
column.assign_attributes(
|
108
|
-
statistics: {
|
109
|
-
raw: stats.dig("raw", column.name),
|
110
|
-
processed: processed,
|
111
|
-
},
|
112
|
-
datatype: actual_schema_type,
|
113
|
-
polars_datatype: actual_type,
|
114
|
-
sample_values: sample_values,
|
115
|
-
)
|
116
|
-
end
|
117
|
-
EasyML::Column.import(existing_columns.to_a,
|
118
|
-
{ on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
|
119
|
-
sample_values] } })
|
120
|
-
end
|
121
|
-
|
122
|
-
def delete_missing(existing_columns)
|
123
|
-
raw_cols = dataset.raw.train(all_columns: true, limit: 1).columns
|
134
|
+
def delete_missing(col_names)
|
135
|
+
raw_cols = dataset.best_segment.data(all_columns: true, limit: 1).columns
|
124
136
|
raw_cols = where(name: raw_cols)
|
125
|
-
columns_to_delete = column_list
|
137
|
+
columns_to_delete = column_list.select do |col|
|
138
|
+
col_names.exclude?(col.name) &&
|
139
|
+
one_hots.map(&:name).exclude?(col.name) &&
|
140
|
+
raw_cols.map(&:name).exclude?(col.name) &&
|
141
|
+
dataset.features.flat_map(&:computes_columns).exclude?(col.name)
|
142
|
+
end
|
126
143
|
columns_to_delete.each(&:destroy!)
|
127
144
|
end
|
128
145
|
end
|