easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +344 -39
- data/app/models/easy_ml/column_history.rb +31 -20
- data/app/models/easy_ml/column_list.rb +79 -62
- data/app/models/easy_ml/dataset.rb +156 -104
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +4 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +29 -10
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/models/easy_ml/model.rb +25 -4
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +5 -3
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +45 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -383
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -128
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -0,0 +1,89 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Shared behaviour for the individual preprocessing adapters
      # (the `< Base` classes such as Mean, Clip or Constant).
      #
      # A subclass declares which preprocessing `method` and/or `params` keys
      # activate it through the `method_applies` / `param_applies` macros and
      # implements `transform(df)`.
      class Base
        class << self
          # Declares that this adapter is activated by the preprocessing param +p+.
          # The name is appended to the global `Imputers.supported_params` list and
          # to this class's entry in `Imputers.params_by_class`.
          #
          # The name is stored as a Symbol in both places because every lookup
          # (`param_applies?`, `Imputer#validate_params!`) compares against `to_sym`.
          def param_applies(p)
            name = p.to_sym
            Imputers.supported_params << name
            (Imputers.params_by_class[self] ||= []) << name
          end

          # Declares that this adapter is activated by the preprocessing method +m+.
          # Mirrors `param_applies`, using the method registries on Imputers.
          def method_applies(m)
            name = m.to_sym
            Imputers.supported_methods << name
            (Imputers.methods_by_class[self] ||= []) << name
          end

          # Human readable label; subclasses override this.
          def description
            "Unknown preprocessing method"
          end
        end

        attr_accessor :column, :preprocessing_step

        # @param column the column this adapter operates on
        # @param preprocessing_step [Hash] step configuration; read through the
        #   `:method` and `:params` keys
        def initialize(column, preprocessing_step)
          @column = column
          @preprocessing_step = preprocessing_step.with_indifferent_access
        end

        # True when either the configured method or one of the configured
        # params was registered by this adapter's class.
        def applies?
          method_applies? || param_applies?
        end

        # A step without a `:method` simply does not match (instead of raising
        # on `nil.to_sym`).
        def method_applies?
          configured = method
          return false if configured.nil?

          imputers_own_methods.include?(configured.to_sym)
        end

        # A param counts only when it is registered for this class and its
        # value is not literally `false`. A step without `:params` does not
        # match (instead of raising on `nil.keys`).
        def param_applies?
          configured = params
          return false if configured.nil?

          configured.keys.any? { |p| imputers_own_params.include?(p.to_sym) && configured[p] != false }
        end

        # Methods registered for this adapter's class (empty when none).
        def imputers_own_methods
          Imputers.methods_by_class[self.class] || []
        end

        # Params registered for this adapter's class (empty when none).
        def imputers_own_params
          Imputers.params_by_class[self.class] || []
        end

        # The `:params` entry of the step, or nil when absent.
        def params
          @preprocessing_step.dig(:params)
        end

        # The `:method` entry of the step, or nil when absent.
        # NOTE: this shadows Object#method on adapter instances.
        def method
          @preprocessing_step.dig(:method)
        end

        # Looks up a learned statistic for the column.
        # Computed columns read from the `:processed` group; all other columns
        # prefer `:clipped` and fall back to `:raw`.
        def statistics(*args)
          if column.is_computed
            column.statistics.dig(:processed, *args)
          else
            column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
          end
        end

        # A concrete adapter always has something to do.
        def anything?
          true
        end

        def inspect
          params_str = params ? params.map { |k, v| "#{k}: #{v}" }.join(", ") : "none"
          method_str = method || "none"

          "#<#{self.class.name} method=#{method_str.inspect} params={#{params_str}}>"
        end

        alias_method :to_s, :inspect

        # Subclasses must override; the base implementation always raises.
        def transform(df)
          raise "Method not implemented"
        end

        # Instance-level shortcut to the class-level description.
        def description
          self.class.description
        end
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Collapses values outside the column's allowed categories into the
      # literal "other". Only columns whose datatype is :categorical are
      # rewritten; every other datatype passes through untouched.
      class Categorical < Base
        method_applies :categorical
        param_applies :categorical_min

        def self.description
          "Categorical imputation"
        end

        # Returns +df+ unchanged when no allowed categories are known or the
        # column is not categorical; otherwise returns the frame with the
        # column rewritten in place (same column name).
        def transform(df)
          permitted = allowed_categories
          return df unless permitted.present?
          return df unless column.datatype == :categorical

          name = column.name
          bucketed = Polars.when(Polars.col(name).is_in(permitted))
                           .then(Polars.col(name))
                           .otherwise(Polars.lit("other"))
                           .alias(name)
          df.with_column(bucketed)
        end

        # Categories the column is allowed to keep, as reported by the column.
        def allowed_categories
          column.allowed_categories
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Clamps the column's values to the [min, max] range configured under
      # the `clip` preprocessing param.
      class Clip < Base
        attr_accessor :column, :dataset, :preprocessing_step

        param_applies :clip

        def self.description
          "Clip"
        end

        # Returns the frame with the column replaced by its clipped version.
        def transform(df)
          clipped = Polars.col(column.name).clip(min, max).alias(column.name)
          df.with_column(clipped)
        end

        # Lower bound; 0 when `clip.min` is not configured.
        def min
          bound(:min, 0)
        end

        # Upper bound; positive infinity when `clip.max` is not configured.
        def max
          bound(:max, Float::INFINITY)
        end

        private

        # Reads one side of the clip configuration, using +fallback+ when the
        # configured value is nil or false.
        def bound(side, fallback)
          params.dig(:clip, side) || fallback
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Fills nulls in the column with the value configured under the
      # `constant` preprocessing param.
      class Constant < Base
        method_applies :constant
        param_applies :constant

        def self.description
          "Constant value imputation"
        end

        # Returns +df+ unchanged when no constant is configured.
        # NOTE(review): `present?` is also false for a literal `false`, so a
        # boolean constant of `false` is skipped — confirm this is intended.
        def transform(df)
          fill_value = constant
          return df unless fill_value.present?

          filled = Polars.col(column.name).fill_null(fill_value).alias(column.name)
          df.with_column(filled)
        end

        # The configured replacement value, or nil when absent.
        def constant
          params.dig(:constant)
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Replaces nulls with the column's learned `last_value` statistic,
      # casting both the fill value and the existing values to the column's
      # Polars datatype.
      class Ffill < Base
        method_applies :ffill

        def self.description
          "Forward fill imputation"
        end

        # Returns +df+ unchanged when no last value has been learned.
        #
        # A learned `false` is a legitimate fill value (boolean columns), but
        # `false.present?` is false, so it is allowed through explicitly;
        # nil and blank values are still skipped.
        def transform(df)
          fill_value = last_value
          return df unless fill_value == false || fill_value.present?

          name = column.name
          dtype = column.polars_datatype
          df.with_column(
            Polars.when(Polars.col(name).is_null)
                  .then(Polars.lit(fill_value).cast(dtype))
                  .otherwise(Polars.col(name).cast(dtype))
                  .alias(name)
          )
        end

        # The learned `:last_value` statistic for the column (may be nil).
        def last_value
          statistics(:last_value)
        end
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Runs a single preprocessing step of a column as an ordered chain of
      # adapters. The step is validated on construction against the params
      # and methods registered on Imputers.
      class Imputer
        attr_accessor :dataset, :column, :preprocessing_step

        # @param column the column being preprocessed (must respond to `dataset`)
        # @param preprocessing_step [Hash] step configuration (`:method`, `:params`)
        # @raise [ArgumentError] when the step names an unsupported param or method
        def initialize(column, preprocessing_step)
          @column = column
          @dataset = column.dataset
          @preprocessing_step = preprocessing_step.with_indifferent_access
          validate_preprocessing_step!
        end

        def inspect
          "#<#{self.class.name} adapters=#{adapters.map(&:inspect).join(", ")}>"
        end

        # Adapter classes in the order they are applied by `transform`.
        def ordered_adapters
          [
            Clip,
            Mean,
            Median,
            Constant,
            Ffill,
            Categorical,
            MostFrequent,
            Today,
            OneHotEncoder,
            OrdinalEncoder,
          ]
        end

        # Instantiated adapters that apply to this step, in `ordered_adapters`
        # order. Memoized.
        def adapters
          @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select(&:applies?)
        end

        # One Imputer per key of the column's preprocessing steps, keyed by
        # symbol; nil when the column has no preprocessing steps.
        #
        # Imputer#initialize takes two positional arguments, so they are passed
        # positionally, and the hash is built with each_with_object so the
        # accumulator is always the hash.
        def imputers
          steps = column.preprocessing_steps
          return nil if steps.blank?

          @imputers ||= steps.each_with_object({}) do |(key, step), by_key|
            by_key[key.to_sym] = Imputer.new(column, step)
          end
        end

        # Comma separated descriptions of the applicable adapters.
        def description
          adapters.map(&:description).compact.join(", ")
        end

        # True when at least one adapter applies to this step.
        def anything?
          adapters.any?
        end

        # Pipes +df+ through every applicable adapter in order.
        def transform(df)
          return df unless anything?

          adapters.reduce(df) { |frame, adapter| adapter.transform(frame) }
        end

        # Applies only the Clip adapter, and only when it applies to this step.
        def clip(df)
          return df unless adapters.any? { |adapter| adapter.class == Clip }

          Clip.new(column, preprocessing_step).transform(df)
        end

        # Decodes ordinal labels, but only when the OrdinalEncoder adapter
        # applies to this step.
        def decode_labels(df)
          return df unless adapters.any? { |adapter| adapter.class == OrdinalEncoder }

          OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
        end

        private

        def validate_preprocessing_step!
          validate_params!
          validate_method!
        end

        # Every configured param key must be registered in Imputers.supported_params.
        def validate_params!
          configured = preprocessing_step[:params]
          return unless configured

          configured.each_key do |param|
            next if Imputers.supported_params.include?(param.to_sym)

            raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
          end
        end

        # The configured method, when present, must be registered in
        # Imputers.supported_methods.
        def validate_method!
          return unless preprocessing_step[:method]
          return if Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)

          raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Fills nulls in the column with its learned `:mean` statistic.
      class Mean < Base
        method_applies :mean

        def self.description
          "Mean imputation"
        end

        # Returns +df+ unchanged when no mean has been learned; otherwise
        # returns the frame with nulls in the column replaced by the mean.
        def transform(df)
          fill_value = mean
          return df unless fill_value.present?

          filled = Polars.col(column.name).fill_null(fill_value).alias(column.name)
          df.with_column(filled)
        end

        # The learned mean for the column (may be nil).
        def mean
          statistics(:mean)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Fills nulls in the column with its learned `:median` statistic.
      class Median < Base
        method_applies :median

        def self.description
          "Median imputation"
        end

        # Returns +df+ unchanged when no median has been learned; otherwise
        # returns the frame with nulls in the column replaced by the median.
        def transform(df)
          fill_value = median
          return df unless fill_value.present?

          filled = Polars.col(column.name).fill_null(fill_value).alias(column.name)
          df.with_column(filled)
        end

        # The learned median for the column (may be nil).
        def median
          statistics(:median)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Fills nulls in the column with its learned `:most_frequent_value`
      # statistic.
      class MostFrequent < Base
        method_applies :most_frequent

        def self.description
          "Most frequent value imputation"
        end

        # Returns +df+ unchanged when no most-frequent value has been learned.
        #
        # A learned `false` is a legitimate fill value (boolean columns), but
        # `false.present?` is false, so it is allowed through explicitly;
        # nil and blank values are still skipped.
        def transform(df)
          fill_value = most_frequent
          return df unless fill_value == false || fill_value.present?

          df.with_column(
            Polars.col(column.name).fill_null(fill_value).alias(column.name)
          )
        end

        # The learned most frequent value for the column (may be nil).
        def most_frequent
          statistics(:most_frequent_value)
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Expands the column into one boolean column per allowed category and
      # drops the original column.
      class OneHotEncoder < Base
        param_applies :one_hot

        def self.description
          "One-hot encoder"
        end

        # Returns +df+ unchanged when there are no allowed categories.
        # Each new column is named "<column>_<category>" with dashes replaced
        # by underscores, and holds whether the stringified value equals the
        # stringified category.
        def transform(df)
          categories = allowed_categories
          return df unless categories.present?

          source = column.name
          expanded = categories.reduce(df) do |frame, value|
            indicator = "#{source}_#{value}".tr("-", "_")
            frame.with_column(
              frame[source].cast(Polars::String).eq(value.to_s).cast(Polars::Boolean).alias(indicator)
            )
          end
          expanded.drop([source])
        end

        # The column's allowed categories in sorted order.
        def allowed_categories
          column.allowed_categories.sort
        end
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Replaces category values with the integer codes stored in the column's
      # `:label_encoder` statistic, and maps codes back through
      # `:label_decoder`.
      class OrdinalEncoder < Base
        param_applies :ordinal_encoding

        def self.description
          "Ordinal encoder"
        end

        # Returns +df+ unchanged when no label encoder has been learned.
        # For :categorical columns, values outside the allowed categories are
        # first replaced by "other"; then every value is mapped to its code,
        # with unmapped values receiving `other_value`.
        def transform(df)
          encoder = label_encoder
          return df unless encoder.present?

          name = column.name
          if column.datatype == :categorical
            df = df.with_column(
              Polars.when(Polars.col(name).is_in(allowed_categories))
                    .then(Polars.col(name))
                    .otherwise(Polars.lit("other"))
                    .alias(name)
            )
          end

          # Computed once: it is the same for every unmapped row.
          fallback = other_value
          df.with_column(
            df[name].map { |v| encoder[column.cast(v)] || fallback }.alias(name)
          )
        end

        # Maps codes back to labels. Accepts either an Array of codes (returns
        # an Array of labels) or a frame (returns the frame with the column
        # decoded in place).
        def decode_labels(df)
          decoder = label_decoder
          return df.map { |v| decoder[v.to_i] } if df.is_a?(Array)

          df.with_column(
            df[column.name].map { |v| decoder[v.to_i] }.alias(column.name)
          )
        end

        # Labels known to the encoder.
        def categories
          label_encoder.keys
        end

        # Codes known to the encoder.
        def values
          label_encoder.values
        end

        # Re-keys the encoder with keys cast through the column.
        def cast_encoder(encoder)
          encoder.transform_keys { |k| column.cast(k) }
        end

        # Re-keys the decoder with integer keys.
        def cast_decoder(decoder)
          decoder.transform_keys { |k| k.to_i }
        end

        # Learned label => code mapping. Returns an empty Hash (not memoized)
        # while the statistic is missing, so callers such as `transform` can
        # test it with `present?` instead of hitting `nil.transform_keys`.
        def label_encoder
          @label_encoder ||= begin
            learned = statistics(:label_encoder)
            learned.nil? ? nil : cast_encoder(learned)
          end
          @label_encoder || {}
        end

        # Learned code => label mapping; empty Hash while the statistic is
        # missing (see `label_encoder`).
        def label_decoder
          @label_decoder ||= begin
            learned = statistics(:label_decoder)
            learned.nil? ? nil : cast_decoder(learned)
          end
          @label_decoder || {}
        end

        # Code assigned to values missing from the encoder: one past the
        # largest known code (0 when the encoder is empty).
        def other_value
          (label_encoder.values.max || -1) + 1
        end

        # Categories the column is allowed to keep, as reported by the column.
        def allowed_categories
          column.allowed_categories
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Fills nulls in the column with the beginning of the current day, as
      # given by `UTC.today`.
      class Today < Base
        method_applies :today

        def self.description
          "Current date imputation"
        end

        # Returns the frame with nulls in the column replaced by today's
        # start-of-day value.
        def transform(df)
          start_of_today = Polars.lit(UTC.today.beginning_of_day)
          filled = Polars.col(column.name).fill_null(start_of_today).alias(column.name)
          df.with_column(filled)
        end
      end
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
module EasyML
  class Column
    # Entry point for a column's preprocessing: holds the class-level
    # registries that adapters fill in, and builds one Imputer per configured
    # preprocessing step.
    class Imputers
      attr_accessor :dataset, :column

      # Preprocessing method => params listed for it.
      ALLOWED_PARAMS = {
        constant: [:constant],
        categorical: %i[categorical_min one_hot ordinal_encoding],
        most_frequent: %i[one_hot ordinal_encoding],
        mean: [:clip],
        median: [:clip],
      }.freeze

      # Datatype => selectable strategies, each as a `{ value:, label: }` pair.
      PREPROCESSING_STRATEGIES = {
        float: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        integer: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        boolean: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        datetime: [
          { value: "ffill", label: "Forward Fill" },
          { value: "constant", label: "Constant Value" },
          { value: "today", label: "Current Date" },
        ],
        string: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        text: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        categorical: [
          { value: "ffill", label: "Forward Fill" },
          { value: "categorical", label: "Categorical" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
      }.freeze

      # All class-level state lives in class instance variables and is
      # defined exactly once here.
      class << self
        # Hash of constants exposed by this class.
        # NOTE: this overrides Module#constants for this class.
        def constants
          {
            preprocessing_strategies: PREPROCESSING_STRATEGIES,
          }
        end

        # Adapter class => params it registered.
        def params_by_class
          @params_by_class ||= {}
        end

        # Adapter class => methods it registered.
        def methods_by_class
          @methods_by_class ||= {}
        end

        # Every param registered by any adapter.
        def supported_params
          @supported_params ||= []
        end

        # Every method registered by any adapter.
        def supported_methods
          @supported_methods ||= []
        end
      end

      # @param column the column whose preprocessing steps are used
      #   (must respond to `dataset` and `preprocessing_steps`)
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # One Imputer per preprocessing step key, keyed by symbol.
      # Returns an empty Hash when the column has no preprocessing steps.
      def imputers
        steps = column.preprocessing_steps
        return {} if steps.blank?

        @imputers ||= steps.each_with_object({}) do |(key, step), by_key|
          by_key[key.to_sym] = Imputer.new(column, step)
        end
      end

      # Imputer for the :training step, or a NullImputer when not configured.
      def training
        @training ||= imputer_group(:training)
      end

      # Imputer for the :inference step, or a NullImputer when not configured.
      def inference
        @inference ||= imputer_group(:inference)
      end

      # Description of the training step wrapped in an Array; empty when the
      # column has no preprocessing steps.
      def preprocessing_descriptions
        return [] if column.preprocessing_steps.blank?

        [training.description].compact
      end

      private

      def imputer_group(key)
        imputers[key.to_sym] || NullImputer.new
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module EasyML
  class Column
    # Thin wrapper that picks the learner adapter for a column and forwards
    # `learn` to it.
    class Learner
      attr_accessor :dataset, :column

      # @param column the column to learn from (must respond to `dataset`)
      def initialize(column)
        @dataset = column.dataset
        @column = column
      end

      # The adapter instance chosen by Learners::Base.adapter for this
      # column. Built on first use and then reused.
      def learner
        return @learner if @learner

        adapter_class = EasyML::Column::Learners::Base.adapter(column)
        @learner = adapter_class.new(column)
      end

      delegate :learn, to: :learner
    end
  end
end
|