easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +33 -0
- data/app/controllers/easy_ml/datasources_controller.rb +7 -0
- data/app/controllers/easy_ml/models_controller.rb +38 -0
- data/app/frontend/components/DatasetCard.tsx +212 -0
- data/app/frontend/components/ModelCard.tsx +69 -29
- data/app/frontend/components/StackTrace.tsx +13 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
- data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
- data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
- data/app/frontend/components/models/UploadModelModal.tsx +212 -0
- data/app/frontend/components/models/index.ts +2 -0
- data/app/frontend/pages/DatasetsPage.tsx +36 -130
- data/app/frontend/pages/DatasourcesPage.tsx +22 -2
- data/app/frontend/pages/ModelsPage.tsx +37 -11
- data/app/frontend/types/dataset.ts +1 -2
- data/app/frontend/types.ts +1 -1
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +4 -0
- data/app/models/easy_ml/column/imputers/clip.rb +5 -3
- data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
- data/app/models/easy_ml/column/imputers/mean.rb +7 -3
- data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
- data/app/models/easy_ml/column/imputers.rb +3 -1
- data/app/models/easy_ml/column/lineage/base.rb +5 -1
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
- data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +4 -0
- data/app/models/easy_ml/column.rb +79 -63
- data/app/models/easy_ml/column_history.rb +28 -28
- data/app/models/easy_ml/column_list/imputer.rb +23 -0
- data/app/models/easy_ml/column_list.rb +39 -26
- data/app/models/easy_ml/dataset/learner/base.rb +34 -0
- data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
- data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
- data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
- data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
- data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
- data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
- data/app/models/easy_ml/dataset/learner/query.rb +25 -0
- data/app/models/easy_ml/dataset/learner.rb +100 -0
- data/app/models/easy_ml/dataset.rb +150 -36
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/datasource.rb +9 -0
- data/app/models/easy_ml/event.rb +4 -0
- data/app/models/easy_ml/export/column.rb +27 -0
- data/app/models/easy_ml/export/dataset.rb +37 -0
- data/app/models/easy_ml/export/datasource.rb +12 -0
- data/app/models/easy_ml/export/feature.rb +24 -0
- data/app/models/easy_ml/export/model.rb +40 -0
- data/app/models/easy_ml/export/retraining_job.rb +20 -0
- data/app/models/easy_ml/export/splitter.rb +14 -0
- data/app/models/easy_ml/feature.rb +21 -0
- data/app/models/easy_ml/import/column.rb +35 -0
- data/app/models/easy_ml/import/dataset.rb +148 -0
- data/app/models/easy_ml/import/feature.rb +36 -0
- data/app/models/easy_ml/import/model.rb +136 -0
- data/app/models/easy_ml/import/retraining_job.rb +29 -0
- data/app/models/easy_ml/import/splitter.rb +34 -0
- data/app/models/easy_ml/lineage.rb +44 -0
- data/app/models/easy_ml/model.rb +93 -36
- data/app/models/easy_ml/model_file.rb +6 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
- data/app/models/easy_ml/models/xgboost.rb +33 -9
- data/app/models/easy_ml/retraining_job.rb +8 -1
- data/app/models/easy_ml/retraining_run.rb +6 -4
- data/app/models/easy_ml/splitter.rb +8 -0
- data/app/models/lineage_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +7 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
- data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
- data/config/routes.rb +13 -1
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
- data/lib/easy_ml/core/tuner.rb +12 -11
- data/lib/easy_ml/data/polars_column.rb +149 -100
- data/lib/easy_ml/data/polars_reader.rb +8 -5
- data/lib/easy_ml/data/polars_schema.rb +56 -0
- data/lib/easy_ml/data/splits/file_split.rb +20 -2
- data/lib/easy_ml/data/splits/split.rb +10 -1
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/deep_compact.rb +19 -0
- data/lib/easy_ml/feature_store.rb +2 -6
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
- data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
- data/lib/easy_ml/timing.rb +34 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +2 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
- metadata +52 -12
- data/app/models/easy_ml/column/learners/base.rb +0 -103
- data/app/models/easy_ml/column/learners/boolean.rb +0 -11
- data/app/models/easy_ml/column/learners/categorical.rb +0 -51
- data/app/models/easy_ml/column/learners/datetime.rb +0 -19
- data/app/models/easy_ml/column/learners/null.rb +0 -22
- data/app/models/easy_ml/column/learners/numeric.rb +0 -33
- data/app/models/easy_ml/column/learners/string.rb +0 -15
- data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -8,13 +8,17 @@ module EasyML
|
|
8
8
|
"Mean imputation"
|
9
9
|
end
|
10
10
|
|
11
|
+
def expr
|
12
|
+
return super unless mean.present?
|
13
|
+
|
14
|
+
Polars.col(column.name).fill_null(mean).alias(column.name)
|
15
|
+
end
|
16
|
+
|
11
17
|
def transform(df)
|
12
18
|
return df unless mean.present?
|
13
19
|
|
14
20
|
mean = statistics(:mean)
|
15
|
-
df = df.with_column(
|
16
|
-
Polars.col(column.name).fill_null(mean).alias(column.name)
|
17
|
-
)
|
21
|
+
df = df.with_column(expr)
|
18
22
|
df
|
19
23
|
end
|
20
24
|
|
@@ -74,9 +74,10 @@ module EasyML
|
|
74
74
|
@supported_methods ||= []
|
75
75
|
end
|
76
76
|
|
77
|
-
def initialize(column)
|
77
|
+
def initialize(column, imputers: [])
|
78
78
|
@column = column
|
79
79
|
@dataset = column.dataset
|
80
|
+
@_imputers = imputers
|
80
81
|
end
|
81
82
|
|
82
83
|
class << self
|
@@ -97,6 +98,7 @@ module EasyML
|
|
97
98
|
hash[key.to_sym] = Imputer.new(
|
98
99
|
column,
|
99
100
|
column.preprocessing_steps[key],
|
101
|
+
@_imputers
|
100
102
|
)
|
101
103
|
end
|
102
104
|
end
|
@@ -9,11 +9,15 @@ module EasyML
|
|
9
9
|
@dataset = column.dataset
|
10
10
|
end
|
11
11
|
|
12
|
+
def expr
|
13
|
+
Polars.col(column.name)
|
14
|
+
end
|
15
|
+
|
12
16
|
def as_json
|
13
17
|
{
|
14
18
|
key: key,
|
15
19
|
description: description,
|
16
|
-
|
20
|
+
occurred_at: occurred_at,
|
17
21
|
}.with_indifferent_access
|
18
22
|
end
|
19
23
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module EasyML
|
2
2
|
class Column
|
3
3
|
class Selector
|
4
|
+
include EasyML::Timing
|
5
|
+
|
4
6
|
attr_accessor :selected, :dataset, :column, :transform
|
5
7
|
|
6
8
|
def initialize(column, selected = nil, &block)
|
@@ -28,6 +30,8 @@ module EasyML
|
|
28
30
|
end
|
29
31
|
end
|
30
32
|
|
33
|
+
measure_method_timing :clipped
|
34
|
+
|
31
35
|
def processed
|
32
36
|
Selector.new(column, :processed)
|
33
37
|
end
|
@@ -2,29 +2,29 @@
|
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_columns
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# dataset_id
|
7
|
-
# name
|
8
|
-
# description
|
9
|
-
# datatype
|
10
|
-
# polars_datatype
|
11
|
-
# is_target
|
12
|
-
# hidden
|
13
|
-
# drop_if_null
|
14
|
-
# preprocessing_steps
|
15
|
-
# sample_values
|
16
|
-
# statistics
|
17
|
-
# created_at
|
18
|
-
# updated_at
|
19
|
-
# is_date_column
|
20
|
-
# computed_by
|
21
|
-
# is_computed
|
22
|
-
# feature_id
|
23
|
-
# learned_at
|
24
|
-
# is_learning
|
25
|
-
# last_datasource_sha
|
26
|
-
# last_feature_sha
|
27
|
-
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# dataset_id :bigint not null
|
7
|
+
# name :string not null
|
8
|
+
# description :string
|
9
|
+
# datatype :string
|
10
|
+
# polars_datatype :string
|
11
|
+
# is_target :boolean default(FALSE)
|
12
|
+
# hidden :boolean default(FALSE)
|
13
|
+
# drop_if_null :boolean default(FALSE)
|
14
|
+
# preprocessing_steps :jsonb
|
15
|
+
# sample_values :json
|
16
|
+
# statistics :json
|
17
|
+
# created_at :datetime not null
|
18
|
+
# updated_at :datetime not null
|
19
|
+
# is_date_column :boolean default(FALSE)
|
20
|
+
# computed_by :string
|
21
|
+
# is_computed :boolean default(FALSE)
|
22
|
+
# feature_id :bigint
|
23
|
+
# learned_at :datetime
|
24
|
+
# is_learning :boolean default(FALSE)
|
25
|
+
# last_datasource_sha :string
|
26
|
+
# last_feature_sha :string
|
27
|
+
# in_raw_dataset :boolean
|
28
28
|
#
|
29
29
|
module EasyML
|
30
30
|
class Column < ActiveRecord::Base
|
@@ -32,8 +32,11 @@ module EasyML
|
|
32
32
|
include Historiographer::Silent
|
33
33
|
historiographer_mode :snapshot_only
|
34
34
|
|
35
|
+
include EasyML::Timing
|
36
|
+
|
35
37
|
belongs_to :dataset, class_name: "EasyML::Dataset"
|
36
38
|
belongs_to :feature, class_name: "EasyML::Feature", optional: true
|
39
|
+
has_many :lineages, class_name: "EasyML::Lineage"
|
37
40
|
|
38
41
|
validates :name, presence: true
|
39
42
|
validates :name, uniqueness: { scope: :dataset_id }
|
@@ -43,7 +46,7 @@ module EasyML
|
|
43
46
|
before_save :set_defaults
|
44
47
|
before_save :set_feature_lineage
|
45
48
|
before_save :set_polars_datatype
|
46
|
-
after_find :ensure_feature_exists
|
49
|
+
# after_find :ensure_feature_exists
|
47
50
|
|
48
51
|
# Scopes
|
49
52
|
scope :visible, -> { where(hidden: false) }
|
@@ -60,6 +63,7 @@ module EasyML
|
|
60
63
|
scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
|
61
64
|
scope :computed, -> { where(is_computed: true) }
|
62
65
|
scope :raw, -> { where(is_computed: false) }
|
66
|
+
scope :has_clip, -> { where("preprocessing_steps->'training'->>'params' IS NOT NULL AND preprocessing_steps->'training'->'params' @> jsonb_build_object('clip', jsonb_build_object())") }
|
63
67
|
scope :needs_learn, -> {
|
64
68
|
datasource_changed
|
65
69
|
.or(feature_applied)
|
@@ -142,26 +146,10 @@ module EasyML
|
|
142
146
|
data.blank?
|
143
147
|
end
|
144
148
|
|
145
|
-
def
|
146
|
-
return
|
147
|
-
|
148
|
-
if !in_raw_dataset? && read_attribute(:datatype).nil?
|
149
|
-
assign_attributes(datatype: processed.data.to_series.dtype)
|
150
|
-
end
|
151
|
-
set_sample_values
|
152
|
-
new_stats = learner.learn(type: type).symbolize_keys
|
153
|
-
|
154
|
-
if !in_raw_dataset?
|
155
|
-
new_stats[:raw] = new_stats[:processed]
|
156
|
-
end
|
149
|
+
def merge_statistics(new_stats)
|
150
|
+
return unless new_stats.present?
|
157
151
|
|
158
|
-
assign_attributes(statistics: (
|
159
|
-
assign_attributes(
|
160
|
-
learned_at: UTC.now,
|
161
|
-
last_datasource_sha: dataset.last_datasource_sha,
|
162
|
-
last_feature_sha: feature&.sha,
|
163
|
-
is_learning: type == :raw,
|
164
|
-
)
|
152
|
+
assign_attributes(statistics: (statistics || {}).symbolize_keys.deep_merge!(new_stats))
|
165
153
|
end
|
166
154
|
|
167
155
|
def set_configuration_changed_at
|
@@ -174,7 +162,7 @@ module EasyML
|
|
174
162
|
use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
|
175
163
|
|
176
164
|
base = use_processed ? processed : raw
|
177
|
-
sample_values = base.data(limit: 5, unique: true)
|
165
|
+
sample_values = base.data(limit: 5, unique: true, select: [name])
|
178
166
|
if sample_values.columns.include?(name)
|
179
167
|
sample_values = sample_values[name].to_a.uniq[0...5]
|
180
168
|
assign_attributes(sample_values: sample_values)
|
@@ -188,8 +176,8 @@ module EasyML
|
|
188
176
|
df
|
189
177
|
end
|
190
178
|
|
191
|
-
def imputers
|
192
|
-
@imputers ||= Column::Imputers.new(self)
|
179
|
+
def imputers(imputers = [])
|
180
|
+
@imputers ||= Column::Imputers.new(self, imputers: imputers)
|
193
181
|
end
|
194
182
|
|
195
183
|
def decode_labels(df)
|
@@ -202,29 +190,29 @@ module EasyML
|
|
202
190
|
|
203
191
|
def datatype=(dtype)
|
204
192
|
if dtype.is_a?(Polars::DataType)
|
205
|
-
dtype =
|
193
|
+
dtype = polars_to_sym(dtype)
|
206
194
|
end
|
207
195
|
write_attribute(:datatype, dtype)
|
208
196
|
set_polars_datatype
|
209
197
|
end
|
210
198
|
|
199
|
+
def polars_to_sym(dtype)
|
200
|
+
EasyML::Data::PolarsColumn.polars_to_sym(dtype)
|
201
|
+
end
|
202
|
+
|
211
203
|
def datatype
|
212
|
-
read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
|
204
|
+
read_attribute(:datatype) || write_attribute(:datatype, polars_to_sym(assumed_datatype))
|
213
205
|
end
|
214
206
|
|
215
207
|
def raw_dtype
|
216
|
-
|
217
|
-
|
208
|
+
dtype = dataset.raw_schema[name]
|
209
|
+
return nil if dtype.nil?
|
218
210
|
|
219
|
-
|
220
|
-
@raw_dtype = raw&.data&.to_series.try(:dtype)
|
221
|
-
elsif already_computed?
|
222
|
-
@raw_dtype = processed&.data&.to_series&.dtype
|
223
|
-
end
|
211
|
+
polars_to_sym(dtype)
|
224
212
|
end
|
225
213
|
|
226
214
|
def set_polars_datatype
|
227
|
-
raw_type =
|
215
|
+
raw_type = datatype
|
228
216
|
user_type = get_polars_type(datatype)
|
229
217
|
|
230
218
|
if raw_type == user_type
|
@@ -267,8 +255,11 @@ module EasyML
|
|
267
255
|
return @assumed_datatype if @assumed_datatype
|
268
256
|
|
269
257
|
if in_raw_dataset?
|
270
|
-
|
271
|
-
|
258
|
+
@assumed_datatype = dataset.raw_schema[name]
|
259
|
+
# series = (raw.data || datasource_raw).to_series
|
260
|
+
# @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
|
261
|
+
elsif dataset.processed_schema.present?
|
262
|
+
@assumed_datatype = dataset.processed_schema[name]
|
272
263
|
elsif already_computed?
|
273
264
|
return nil if processed.data.nil?
|
274
265
|
|
@@ -277,9 +268,16 @@ module EasyML
|
|
277
268
|
end
|
278
269
|
|
279
270
|
def in_raw_dataset?
|
271
|
+
value = read_attribute(:in_raw_dataset)
|
272
|
+
return value unless value.nil?
|
273
|
+
|
274
|
+
write_attribute(:in_raw_dataset, check_in_raw_dataset?)
|
275
|
+
end
|
276
|
+
|
277
|
+
def check_in_raw_dataset?
|
280
278
|
return false if dataset&.raw&.data.nil?
|
281
279
|
|
282
|
-
dataset.raw.data(all_columns: true)
|
280
|
+
dataset.raw.data(all_columns: true, lazy: true).schema.key?(name) || false
|
283
281
|
end
|
284
282
|
|
285
283
|
def computing_feature
|
@@ -398,10 +396,6 @@ module EasyML
|
|
398
396
|
is_date_column
|
399
397
|
end
|
400
398
|
|
401
|
-
def lineage
|
402
|
-
@lineage ||= EasyML::Column::Lineage.new(self).lineage
|
403
|
-
end
|
404
|
-
|
405
399
|
def required?
|
406
400
|
!is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
|
407
401
|
end
|
@@ -420,6 +414,28 @@ module EasyML
|
|
420
414
|
}.compact
|
421
415
|
end
|
422
416
|
|
417
|
+
UNCONFIGURABLE_COLUMNS = %w(
|
418
|
+
id
|
419
|
+
feature_id
|
420
|
+
dataset_id
|
421
|
+
last_datasource_sha
|
422
|
+
last_feature_sha
|
423
|
+
learned_at
|
424
|
+
is_learning
|
425
|
+
configuration_changed_at
|
426
|
+
statistics
|
427
|
+
created_at
|
428
|
+
updated_at
|
429
|
+
)
|
430
|
+
|
431
|
+
def to_config
|
432
|
+
EasyML::Export::Column.to_config(self)
|
433
|
+
end
|
434
|
+
|
435
|
+
def self.from_config(config, dataset, action: :create)
|
436
|
+
EasyML::Import::Column.from_config(config, dataset, action: action)
|
437
|
+
end
|
438
|
+
|
423
439
|
def cast(value)
|
424
440
|
return value if value.nil?
|
425
441
|
|
@@ -2,34 +2,34 @@
|
|
2
2
|
#
|
3
3
|
# Table name: easy_ml_column_histories
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
# column_id
|
7
|
-
# dataset_id
|
8
|
-
# name
|
9
|
-
# description
|
10
|
-
# datatype
|
11
|
-
# polars_datatype
|
12
|
-
# is_target
|
13
|
-
# hidden
|
14
|
-
# drop_if_null
|
15
|
-
# preprocessing_steps
|
16
|
-
# sample_values
|
17
|
-
# statistics
|
18
|
-
# created_at
|
19
|
-
# updated_at
|
20
|
-
# history_started_at
|
21
|
-
# history_ended_at
|
22
|
-
# history_user_id
|
23
|
-
# snapshot_id
|
24
|
-
# is_date_column
|
25
|
-
# computed_by
|
26
|
-
# is_computed
|
27
|
-
# feature_id
|
28
|
-
# learned_at
|
29
|
-
# is_learning
|
30
|
-
# last_datasource_sha
|
31
|
-
# last_feature_sha
|
32
|
-
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# column_id :integer not null
|
7
|
+
# dataset_id :integer not null
|
8
|
+
# name :string not null
|
9
|
+
# description :string
|
10
|
+
# datatype :string
|
11
|
+
# polars_datatype :string
|
12
|
+
# is_target :boolean default(FALSE)
|
13
|
+
# hidden :boolean default(FALSE)
|
14
|
+
# drop_if_null :boolean default(FALSE)
|
15
|
+
# preprocessing_steps :jsonb
|
16
|
+
# sample_values :json
|
17
|
+
# statistics :json
|
18
|
+
# created_at :datetime not null
|
19
|
+
# updated_at :datetime not null
|
20
|
+
# history_started_at :datetime not null
|
21
|
+
# history_ended_at :datetime
|
22
|
+
# history_user_id :integer
|
23
|
+
# snapshot_id :string
|
24
|
+
# is_date_column :boolean default(FALSE)
|
25
|
+
# computed_by :string
|
26
|
+
# is_computed :boolean default(FALSE)
|
27
|
+
# feature_id :bigint
|
28
|
+
# learned_at :datetime
|
29
|
+
# is_learning :boolean default(FALSE)
|
30
|
+
# last_datasource_sha :string
|
31
|
+
# last_feature_sha :string
|
32
|
+
# in_raw_dataset :boolean
|
33
33
|
#
|
34
34
|
module EasyML
|
35
35
|
class ColumnHistory < ActiveRecord::Base
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module EasyML
|
2
|
+
module ColumnList
|
3
|
+
class Imputer
|
4
|
+
attr_accessor :dataset, :df, :inference, :columns
|
5
|
+
|
6
|
+
def initialize(dataset, df, columns: nil, imputers: [], inference: false)
|
7
|
+
@dataset = dataset
|
8
|
+
@df = df
|
9
|
+
@columns = (columns.nil? || columns.empty?) ? dataset.columns : columns
|
10
|
+
@inference = inference
|
11
|
+
@_imputers = imputers
|
12
|
+
end
|
13
|
+
|
14
|
+
def imputers
|
15
|
+
@imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
|
16
|
+
end
|
17
|
+
|
18
|
+
def exprs
|
19
|
+
imputers.flat_map(&:exprs).compact
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
module EasyML
|
2
2
|
module ColumnList
|
3
3
|
include Historiographer::Relation
|
4
|
+
include EasyML::Timing
|
4
5
|
|
5
6
|
def sync(delete: true)
|
6
7
|
return unless dataset.schema.present?
|
@@ -39,35 +40,28 @@ module EasyML
|
|
39
40
|
df
|
40
41
|
end
|
41
42
|
|
43
|
+
measure_method_timing :transform
|
44
|
+
|
45
|
+
def apply_clip(df)
|
46
|
+
clip_cols = has_clip.raw
|
47
|
+
return df unless clip_cols.any?
|
48
|
+
|
49
|
+
clipped_exprs = EasyML::ColumnList::Imputer.new(
|
50
|
+
dataset,
|
51
|
+
df,
|
52
|
+
columns: clip_cols,
|
53
|
+
imputers: [:clip],
|
54
|
+
).exprs
|
55
|
+
|
56
|
+
df.with_columns(clipped_exprs)
|
57
|
+
end
|
58
|
+
|
42
59
|
def learn(type: :raw, computed: false)
|
43
|
-
|
44
|
-
cols_to_learn = cols_to_learn.computed if computed
|
45
|
-
cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
|
46
|
-
cols_to_learn.each { |col| col.learn(type: type) }
|
47
|
-
EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
|
48
|
-
statistics
|
49
|
-
learned_at
|
50
|
-
sample_values
|
51
|
-
last_datasource_sha
|
52
|
-
is_learning
|
53
|
-
datatype
|
54
|
-
polars_datatype
|
55
|
-
] })
|
56
|
-
set_feature_lineage
|
60
|
+
EasyML::Dataset::Learner.new(dataset, type: type).learn
|
57
61
|
reload
|
58
62
|
end
|
59
63
|
|
60
|
-
|
61
|
-
names = dataset.features.computed_column_names
|
62
|
-
columns = where(name: names, computed_by: nil).map do |col|
|
63
|
-
col.assign_attributes(
|
64
|
-
is_computed: true,
|
65
|
-
computed_by: col.computing_feature&.name,
|
66
|
-
)
|
67
|
-
col
|
68
|
-
end
|
69
|
-
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
|
70
|
-
end
|
64
|
+
measure_method_timing :learn
|
71
65
|
|
72
66
|
def statistics
|
73
67
|
stats = { raw: {}, processed: {} }
|
@@ -115,6 +109,25 @@ module EasyML
|
|
115
109
|
column_list.sort_by { |col| [col.sort_required, col.name] }
|
116
110
|
end
|
117
111
|
|
112
|
+
def set_feature_lineage(cols_to_learn)
|
113
|
+
names = dataset.features.computed_column_names
|
114
|
+
columns = where(name: names, computed_by: nil).map do |col|
|
115
|
+
col.assign_attributes(
|
116
|
+
is_computed: true,
|
117
|
+
computed_by: col.computing_feature&.name,
|
118
|
+
)
|
119
|
+
col
|
120
|
+
end
|
121
|
+
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
|
122
|
+
|
123
|
+
lineage = cols_to_learn.flat_map do |col|
|
124
|
+
EasyML::Lineage.learn(col)
|
125
|
+
end.compact
|
126
|
+
EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
|
127
|
+
end
|
128
|
+
|
129
|
+
measure_method_timing :set_feature_lineage
|
130
|
+
|
118
131
|
private
|
119
132
|
|
120
133
|
def import_new(new_columns, existing_columns)
|
@@ -127,7 +140,7 @@ module EasyML
|
|
127
140
|
col
|
128
141
|
end
|
129
142
|
EasyML::Column.import(cols_to_insert)
|
130
|
-
set_feature_lineage
|
143
|
+
set_feature_lineage(cols_to_insert)
|
131
144
|
column_list.reload
|
132
145
|
end
|
133
146
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class Learner
|
4
|
+
class Base
|
5
|
+
attr_reader :dataset, :columns, :type
|
6
|
+
|
7
|
+
def initialize(dataset, columns, type: :raw)
|
8
|
+
@dataset = dataset
|
9
|
+
@columns = columns
|
10
|
+
@type = type
|
11
|
+
end
|
12
|
+
|
13
|
+
def skip_processing?(column, type)
|
14
|
+
(!column.in_raw_dataset? && type.to_sym != :processed) ||
|
15
|
+
(column.one_hot? && type.to_sym == :processed)
|
16
|
+
end
|
17
|
+
|
18
|
+
TYPES_ALL = %i(raw clipped processed)
|
19
|
+
TYPES_RAW = %i(raw clipped)
|
20
|
+
TYPES_PROCESSED = %i(processed)
|
21
|
+
|
22
|
+
def types(type = :all)
|
23
|
+
case type
|
24
|
+
when :all then TYPES_ALL
|
25
|
+
when :raw then TYPES_RAW
|
26
|
+
when :processed then TYPES_PROCESSED
|
27
|
+
else
|
28
|
+
TYPES_ALL
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class Learner
|
4
|
+
class Eager
|
5
|
+
class Categorical < Query
|
6
|
+
def train_query(df)
|
7
|
+
{
|
8
|
+
counts: counts(df).to_hash,
|
9
|
+
allowed_categories: allowed_categories(df).to_series.to_a,
|
10
|
+
}.merge!(
|
11
|
+
learn_encoder_decoder(df)
|
12
|
+
)
|
13
|
+
end
|
14
|
+
|
15
|
+
def learn_encoder_decoder(df)
|
16
|
+
unsorted = allowed_categories(df).lazy.with_row_count.collect.to_hash.invert
|
17
|
+
|
18
|
+
label_encoder = unsorted.transform_keys(&column.method(:cast)).keys.compact.sort_by(&column.method(:sort_by)).each.with_index.reduce({}) do |h, (k, i)|
|
19
|
+
h.tap do
|
20
|
+
h[k] = i
|
21
|
+
end
|
22
|
+
end
|
23
|
+
label_decoder = label_encoder.invert
|
24
|
+
|
25
|
+
{
|
26
|
+
label_encoder: label_encoder,
|
27
|
+
label_decoder: label_decoder,
|
28
|
+
}
|
29
|
+
end
|
30
|
+
|
31
|
+
def counts(df)
|
32
|
+
return @counts if @counts
|
33
|
+
|
34
|
+
@counts = df.group_by(column.name)
|
35
|
+
.agg(Polars.col(column.name).count.alias("count"))
|
36
|
+
end
|
37
|
+
|
38
|
+
def allowed_categories(df)
|
39
|
+
return @allowed_categories if @allowed_categories
|
40
|
+
|
41
|
+
@allowed_categories = df.join(counts(df), on: column.name)
|
42
|
+
.filter(Polars.col("count").ge(column.categorical_min))
|
43
|
+
.select(column.name)
|
44
|
+
.unique
|
45
|
+
.sort(column.name, reverse: true)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|