easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/engine.rb +1 -0
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -0,0 +1,29 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :ffill preprocessing method.
      #
      # Note: the visible implementation is not a row-by-row forward fill.
      # Every null in the column is replaced with one learned value, the
      # :last_value statistic.
      class Ffill < Base
        method_applies :ffill

        # @return [String] human-readable name of this strategy
        def self.description
          "Forward fill imputation"
        end

        # Replaces nulls in this column with the learned last value, casting
        # both the fill value and the existing values to the column's Polars
        # datatype.
        #
        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the frame with nulls filled, or +df+
        #   unchanged when no last value has been learned
        def transform(df)
          fill = last_value
          # ActiveSupport treats `false` as blank, so a bare `present?` check
          # would skip boolean columns whose last observed value is `false`.
          return df unless fill.present? || fill == false

          target_type = column.polars_datatype
          df.with_column(
            Polars.when(Polars.col(column.name).is_null)
              .then(Polars.lit(fill).cast(target_type))
              .otherwise(Polars.col(column.name).cast(target_type))
              .alias(column.name)
          )
        end

        # @return [Object, nil] the :last_value statistic for this column
        #   (looked up through `statistics`, which is defined outside this file)
        def last_value
          statistics(:last_value)
        end
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Applies the imputation adapters configured by a single preprocessing
      # step of a column. The adapters are tried in a fixed order and only
      # those whose `applies?` is true take part.
      class Imputer
        attr_accessor :dataset, :column, :preprocessing_step

        # @param column [Object] the column being imputed; must respond to
        #   `dataset` and `preprocessing_steps`
        # @param preprocessing_step [Hash] step configuration; read with
        #   indifferent access under the :method and :params keys
        # @raise [ArgumentError] when the step names an unsupported method or
        #   parameter
        def initialize(column, preprocessing_step)
          @column = column
          @dataset = column.dataset
          @preprocessing_step = preprocessing_step.with_indifferent_access
          validate_preprocessing_step!
        end

        def inspect
          "#<#{self.class.name} adapters=#{adapters.map(&:inspect).join(", ")}>"
        end

        # Adapter classes in the order they are applied by #transform.
        # Kept as a method (not a constant) so the adapter classes are only
        # resolved when first needed.
        def ordered_adapters
          [
            Clip,
            Mean,
            Median,
            Constant,
            Ffill,
            Categorical,
            MostFrequent,
            Today,
            OneHotEncoder,
            OrdinalEncoder,
          ]
        end

        # @return [Array] memoized adapter instances that apply to this step
        def adapters
          @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select(&:applies?)
        end

        # Builds one Imputer per key of the column's preprocessing steps.
        #
        # The initializer takes two positional arguments, so each Imputer is
        # built positionally, and the result hash is accumulated explicitly.
        #
        # @return [Hash{Symbol => Imputer}, nil] nil when the column has no
        #   preprocessing steps
        def imputers
          return nil if column.preprocessing_steps.blank?

          @imputers ||= column.preprocessing_steps.each_with_object({}) do |(key, step), by_key|
            by_key[key.to_sym] = Imputer.new(column, step)
          end
        end

        # @return [String] comma-separated descriptions of the active adapters
        def description
          adapters.map(&:description).compact.join(", ")
        end

        # @return [Boolean] true when at least one adapter applies
        def anything?
          adapters.any?
        end

        # Runs every active adapter over the frame, in order.
        #
        # @param df [Polars::DataFrame]
        # @return [Polars::DataFrame] the transformed frame, or +df+ itself
        #   when no adapter applies
        def transform(df)
          return df unless anything?

          adapters.reduce(df) { |frame, adapter| adapter.transform(frame) }
        end

        # Applies only the Clip adapter, when it is active for this step.
        def clip(df)
          return df unless adapter_active?(Clip)

          EasyML::Column::Imputers::Clip.new(column, preprocessing_step).transform(df)
        end

        # Decodes ordinal labels, when the OrdinalEncoder is active for this step.
        def decode_labels(df)
          return df unless adapter_active?(OrdinalEncoder)

          EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
        end

        private

        # True when an adapter of exactly the given class is active.
        def adapter_active?(klass)
          adapters.map(&:class).include?(klass)
        end

        def validate_preprocessing_step!
          validate_params!
          validate_method!
        end

        # Rejects any :params key that is not listed in Imputers.supported_params.
        def validate_params!
          return unless preprocessing_step[:params]

          preprocessing_step[:params].keys.each do |param|
            next if Imputers.supported_params.include?(param.to_sym)

            raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
          end
        end

        # Rejects a :method that is not listed in Imputers.supported_methods.
        def validate_method!
          return unless preprocessing_step[:method]
          return if Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)

          raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :mean preprocessing method:
      # nulls are replaced with the learned :mean statistic.
      class Mean < Base
        method_applies :mean

        # @return [String] human-readable name of this strategy
        def self.description
          "Mean imputation"
        end

        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the frame with nulls filled, or +df+
        #   unchanged when no mean is available
        def transform(df)
          fill = mean
          return df unless fill.present?

          filled = Polars.col(column.name).fill_null(fill).alias(column.name)
          df.with_column(filled)
        end

        # @return [Numeric, nil] the :mean statistic (via `statistics`,
        #   which is defined outside this file)
        def mean
          statistics(:mean)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :median preprocessing method:
      # nulls are replaced with the learned :median statistic.
      class Median < Base
        method_applies :median

        # @return [String] human-readable name of this strategy
        def self.description
          "Median imputation"
        end

        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the frame with nulls filled, or +df+
        #   unchanged when no median is available
        def transform(df)
          fill = median
          return df unless fill.present?

          filled = Polars.col(column.name).fill_null(fill).alias(column.name)
          df.with_column(filled)
        end

        # @return [Numeric, nil] the :median statistic (via `statistics`,
        #   which is defined outside this file)
        def median
          statistics(:median)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :most_frequent preprocessing
      # method: nulls are replaced with the learned :most_frequent_value
      # statistic.
      class MostFrequent < Base
        method_applies :most_frequent

        # @return [String] human-readable name of this strategy
        def self.description
          "Most frequent value imputation"
        end

        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the frame with nulls filled, or +df+
        #   unchanged when no most-frequent value has been learned
        def transform(df)
          fill = most_frequent
          # ActiveSupport treats `false` as blank, so a bare `present?` check
          # would skip boolean columns whose most frequent value is `false`.
          return df unless fill.present? || fill == false

          df.with_column(
            Polars.col(column.name).fill_null(fill).alias(column.name)
          )
        end

        # @return [Object, nil] the :most_frequent_value statistic (via
        #   `statistics`, which is defined outside this file)
        def most_frequent
          statistics(:most_frequent_value)
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Adapter enabled by the :one_hot preprocessing parameter. It expands
      # the column into one boolean indicator column per allowed category
      # and drops the original column.
      class OneHotEncoder < Base
        param_applies :one_hot

        # @return [String] human-readable name of this strategy
        def self.description
          "One-hot encoder"
        end

        # Adds a "<column>_<category>" boolean column for every allowed
        # category (dashes in the generated name become underscores), then
        # removes the source column.
        #
        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the encoded frame, or +df+ unchanged
        #   when the column has no allowed categories
        def transform(df)
          categories = allowed_categories
          return df unless categories.present?

          encoded = categories.reduce(df) do |frame, category|
            indicator_name = "#{column.name}_#{category}".tr("-", "_")
            # Compare as strings so non-string categories still match.
            indicator = frame[column.name].cast(Polars::String).eq(category.to_s).cast(Polars::Boolean)
            frame.with_column(indicator.alias(indicator_name))
          end

          encoded.drop([column.name])
        end

        # @return [Array] the column's allowed categories, sorted
        def allowed_categories
          column.allowed_categories.sort
        end
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Adapter enabled by the :ordinal_encoding preprocessing parameter.
      # It maps column values to integer labels using the learned
      # :label_encoder statistic, and back again with :label_decoder.
      class OrdinalEncoder < Base
        param_applies :ordinal_encoding

        # @return [String] human-readable name of this strategy
        def self.description
          "Ordinal encoder"
        end

        # Encodes the column as integer labels. Values with no entry in the
        # encoder receive #other_value.
        #
        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the encoded frame, or +df+ unchanged
        #   when no label encoder has been learned
        def transform(df)
          return df unless label_encoder.present?

          if column.datatype == :categorical
            # Collapse categories outside the allowed list into "other"
            # before looking up labels.
            df = df.with_column(
              Polars.when(Polars.col(column.name).is_in(allowed_categories))
                .then(Polars.col(column.name))
                .otherwise(Polars.lit("other"))
                .alias(column.name)
            )
          end

          # Computed once rather than once per unmatched element.
          fallback = other_value
          df.with_column(
            df[column.name].map { |v| label_encoder[column.cast(v)] || fallback }.alias(column.name)
          )
        end

        # Converts integer labels back to their original values.
        #
        # @param df [Polars::DataFrame, Array] a frame containing this column,
        #   or a plain array of labels
        # @return [Polars::DataFrame, Array] same container type as the input
        def decode_labels(df)
          return df.map { |v| label_decoder[v.to_i] } if df.is_a?(Array)

          df.with_column(
            df[column.name].map { |v| label_decoder[v.to_i] }.alias(column.name)
          )
        end

        # @return [Array] the known (cast) category values
        def categories
          label_encoder.keys
        end

        # @return [Array] the labels assigned to the known categories
        def values
          label_encoder.values
        end

        # Re-casts encoder keys through the column's `cast`.
        # NOTE(review): presumably needed because keys stored as statistics
        # lose their original type — confirm against how statistics persist.
        def cast_encoder(encoder)
          encoder.transform_keys { |k| column.cast(k) }
        end

        # Normalizes decoder keys to integers.
        def cast_decoder(decoder)
          decoder.transform_keys { |k| k.to_i }
        end

        # @return [Hash] value => label; empty when nothing has been learned,
        #   so the `present?` guard in #transform can do its job instead of
        #   `cast_encoder(nil)` raising first
        def label_encoder
          @label_encoder ||= cast_encoder(statistics(:label_encoder) || {})
        end

        # @return [Hash] label => value; empty when nothing has been learned
        def label_decoder
          @label_decoder ||= cast_decoder(statistics(:label_decoder) || {})
        end

        # @return [Integer] label for values missing from the encoder: one
        #   past the largest known label (0 when the encoder is empty)
        def other_value
          (label_encoder.values.max || -1) + 1
        end

        def allowed_categories
          column.allowed_categories
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :today preprocessing method:
      # nulls are replaced with the start of the current day, taken from
      # `UTC.today` (a helper defined outside this file).
      class Today < Base
        method_applies :today

        # @return [String] human-readable name of this strategy
        def self.description
          "Current date imputation"
        end

        # @param df [Polars::DataFrame] frame containing this column
        # @return [Polars::DataFrame] the frame with nulls filled
        def transform(df)
          filled = Polars.col(column.name)
            .fill_null(Polars.lit(UTC.today.beginning_of_day))
            .alias(column.name)

          df.with_column(filled)
        end
      end
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
module EasyML
  class Column
    # Entry point for a column's imputation configuration. It builds one
    # Imputer per key of the column's preprocessing_steps and exposes the
    # :training and :inference groups. Class-level registries hold the
    # supported methods and parameters.
    class Imputers
      attr_accessor :dataset, :column

      # Parameters accepted alongside each imputation method.
      ALLOWED_PARAMS = {
        constant: [:constant],
        categorical: %i[categorical_min one_hot ordinal_encoding],
        most_frequent: %i[one_hot ordinal_encoding],
        mean: [:clip],
        median: [:clip],
      }.freeze

      # Strategies offered per column datatype, as value/label pairs.
      PREPROCESSING_STRATEGIES = {
        float: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        integer: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        boolean: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        datetime: [
          { value: "ffill", label: "Forward Fill" },
          { value: "constant", label: "Constant Value" },
          { value: "today", label: "Current Date" },
        ],
        string: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        text: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        categorical: [
          { value: "ffill", label: "Forward Fill" },
          { value: "categorical", label: "Categorical" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
      }.freeze

      class << self
        # NOTE: this intentionally keeps the existing public name even though
        # it shadows Module#constants for this class.
        #
        # @return [Hash] the strategy table, keyed :preprocessing_strategies
        def constants
          {
            preprocessing_strategies: PREPROCESSING_STRATEGIES,
          }
        end

        # Mutable class-level registries. This file only creates them; they
        # are presumably filled by the adapter classes' `method_applies` /
        # `param_applies` macros — confirm in Imputers::Base.
        def params_by_class
          @params_by_class ||= {}
        end

        def methods_by_class
          @methods_by_class ||= {}
        end

        # @return [Array<Symbol>] parameter names accepted in a step's :params
        def supported_params
          @supported_params ||= []
        end

        # @return [Array<Symbol>] method names accepted as a step's :method
        def supported_methods
          @supported_methods ||= []
        end
      end

      # @param column [Object] must respond to `dataset` and `preprocessing_steps`
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # @return [Hash{Symbol => Imputer}] one Imputer per preprocessing-step
      #   key; empty when the column has no preprocessing steps
      def imputers
        return {} if column.preprocessing_steps.blank?

        @imputers ||= column.preprocessing_steps.keys.each_with_object({}) do |key, by_key|
          by_key[key.to_sym] = Imputer.new(column, column.preprocessing_steps[key])
        end
      end

      # @return [Imputer, NullImputer] imputer for the :training step
      def training
        @training ||= imputer_group(:training)
      end

      # @return [Imputer, NullImputer] imputer for the :inference step
      def inference
        @inference ||= imputer_group(:inference)
      end

      # @return [Array<String>] description of the training imputer, or an
      #   empty array when there is nothing configured
      def preprocessing_descriptions
        return [] if column.preprocessing_steps.blank?

        [training.description].compact
      end

      private

      # Falls back to a NullImputer when no step is configured under +key+.
      def imputer_group(key)
        imputers.dig(key.to_sym) || NullImputer.new
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module EasyML
  class Column
    # Thin facade that picks the datatype-specific learner for a column
    # and forwards `learn` to it.
    class Learner
      attr_accessor :dataset, :column

      delegate :learn, to: :learner

      # @param column [Object] must respond to `dataset`
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # @return [Object] memoized learner instance, of the class chosen by
      #   Learners::Base.adapter for this column
      def learner
        return @learner if @learner

        adapter_class = EasyML::Column::Learners::Base.adapter(column)
        @learner = adapter_class.new(column)
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module EasyML
  class Column
    module Learners
      # Base statistics learner for a column. It gathers statistics for each
      # requested view of the column (raw, clipped, processed); subclasses
      # extend the statistics per datatype.
      class Base
        # Views learned by #learn, by request type.
        TYPES_ALL = %i[raw clipped processed].freeze
        TYPES_RAW = %i[raw clipped].freeze
        TYPES_PROCESSED = %i[processed].freeze

        attr_accessor :column, :dataset, :dtype, :select

        # @param column [Object] must respond to `dataset`
        def initialize(column)
          @column = column
          @dataset = column.dataset
          # Passed as `select:` when reading splits; holds the dataset's
          # date column name when there is one.
          @select = dataset.date_column.present? ? [dataset.date_column.name] : []
        end

        # Picks the learner class for a column from its datatype, inferring
        # the type from the raw data when the column has none recorded.
        #
        # @param column [Object]
        # @return [Class] one of the Learners classes
        # @raise [RuntimeError] when the type cannot be determined or is unknown
        def self.adapter(column)
          begin
            dtype = column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          rescue StandardError
            # The rescued error stays available as #cause of the new exception.
            raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
          end

          case dtype.to_sym
          when :float, :integer
            EasyML::Column::Learners::Numeric
          when :string, :text
            EasyML::Column::Learners::String
          when :categorical
            EasyML::Column::Learners::Categorical
          when :datetime, :date
            EasyML::Column::Learners::Datetime
          when :boolean
            EasyML::Column::Learners::Boolean
          when :null
            EasyML::Column::Learners::Null
          else
            raise "Don't know how to learn from dtype: #{dtype}"
          end
        end

        # @param type [Symbol] :all, :raw or :processed (anything else acts as :all)
        # @return [Array<Symbol>] views to learn; always just :processed for
        #   columns that are not in the raw dataset
        def types(type = :all)
          return TYPES_PROCESSED unless column.in_raw_dataset?

          case type
          when :raw then TYPES_RAW
          when :processed then TYPES_PROCESSED
          else TYPES_ALL
          end
        end

        # @param type [Symbol] see #types
        # @return [Hash{Symbol => Hash}] statistics per learned view
        def learn(type: :all)
          types(type).each_with_object({}) do |t, h|
            h[t] = learn_split(column.send(t))
          end
        end

        # Statistics computed over the whole split.
        #
        # @param df [Polars::DataFrame, nil]
        # @return [Hash] :num_rows and :null_count, or {} when +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          {
            num_rows: df.size,
            null_count: df[column.name].null_count || 0,
          }
        end

        # Statistics computed over the training portion only.
        #
        # @param df [Polars::DataFrame, nil]
        # @return [Hash] :last_value and :most_frequent_value, or {} when
        #   +df+ is nil
        def train_statistics(df)
          return {} if df.nil?

          {
            last_value: last_value(df),
            most_frequent_value: df[column.name].mode.sort.to_a.first,
          }
        end

        # @param split [Object] must respond to `data(select:)` and `train(select:)`
        # @return [Hash] full-split statistics merged with training statistics
        def learn_split(split)
          df = split.data(select: select)
          train_df = split.train(select: select)
          full_dataset_stats = full_dataset_statistics(df)
          train_stats = train_statistics(train_df)
          full_dataset_stats.merge!(train_stats)
        end

        # Most recent non-null value of the column, ordered by the dataset's
        # date column.
        #
        # @param df [Polars::DataFrame]
        # @return [Object, nil] nil when the dataset has no date column, the
        #   frame is empty or lacks the date column, or every value is null
        def last_value(df)
          return unless dataset.date_column.present?

          date_column_name = dataset.date_column.name
          return nil if df.empty? || !df.columns.include?(date_column_name)

          latest = df.sort(date_column_name, reverse: true)
            .filter(Polars.col(column.name).is_not_null)
            .select(column.name)
            .head(1)

          # `item` needs exactly one cell; an all-null column leaves none.
          latest.empty? ? nil : latest.item
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module EasyML
  class Column
    module Learners
      # Learner for categorical columns: on top of the inherited statistics
      # it learns the allowed categories, value counts, and the label
      # encoder/decoder pair.
      class Categorical < String
        # Learns statistics for the requested views. Both the :raw and
        # :processed views are learned from the raw split; :processed
        # additionally reports a null_count of 0. Any other view (e.g.
        # :clipped) is recorded as nil.
        #
        # Accepts the type either positionally (the original signature) or
        # as the `type:` keyword used by Base#learn, and defaults to :all.
        #
        # @param positional_type [Symbol, nil] legacy positional form
        # @param type [Symbol] :all, :raw or :processed
        # @return [Hash{Symbol => Hash, nil}] statistics per view
        def learn(positional_type = nil, type: :all)
          requested = positional_type.nil? ? type : positional_type

          types(requested).each_with_object({}) do |view, stats|
            stats[view] = case view
              when :raw then learn_split(column.raw)
              when :processed then learn_split(column.raw).merge!(null_count: 0)
              end
          end
        end

        # @param df [Polars::DataFrame, nil] training portion of the split
        # @return [Hash] inherited training statistics plus
        #   :allowed_categories, :counts, :value, :label_encoder and
        #   :label_decoder; {} when +df+ is nil
        def train_statistics(df)
          return {} if df.nil?

          categorical_stats = {
            allowed_categories: allowed_categories(df),
            counts: df[column.name].value_counts.to_hash,
          }
          categorical_stats.merge!(learn_encoder_decoder(df))

          super(df).merge!(categorical_stats)
        end

        # Builds the value => count table and assigns each non-null value an
        # integer label in the column's sort order.
        #
        # @param df [Polars::DataFrame]
        # @return [Hash] :value (counts keyed by cast value), :label_encoder
        #   (value => label) and :label_decoder (label => value)
        def learn_encoder_decoder(df)
          value_counts = df[column.name].value_counts
          value_column, count_column = value_counts.columns

          as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h
            .transform_keys { |k| column.cast(k) }
          ordered_values = as_hash.keys.compact.sort_by { |k| column.sort_by(k) }
          label_encoder = ordered_values.each_with_index.to_h

          {
            value: as_hash,
            label_encoder: label_encoder,
            label_decoder: label_encoder.invert,
          }
        end

        # Values seen at least `column.categorical_min` times, sorted in the
        # column's sort order.
        #
        # @param df [Polars::DataFrame]
        # @return [Array] non-null category values
        def allowed_categories(df)
          val_counts = df[column.name].value_counts
          # Read the column names from the result instead of hard-coding
          # "count", matching how learn_encoder_decoder reads them.
          value_column, count_column = val_counts.columns

          frequent = val_counts[val_counts[count_column] >= column.categorical_min]
          frequent[value_column].to_a.compact.sort_by { |k| column.sort_by(k) }
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Column
    module Learners
      # Learner for datetime columns: adds a unique-value count to the
      # full-split statistics and takes the last value from the column's
      # own sorted values.
      class Datetime < Base
        # @param df [Polars::DataFrame, nil]
        # @return [Hash] inherited statistics plus :unique_count, or {} when
        #   +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          stats = super(df)
          stats.merge!(unique_count: df[column.name].n_unique)
        end

        # @param df [Polars::DataFrame]
        # @return [Object] final element of the column after sorting
        def last_value(df)
          sorted_values = df[column.name].sort
          sorted_values[-1]
        end
      end
    end
  end
end
|