easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -0,0 +1,29 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :ffill preprocessing method.
      #
      # Replaces null cells of the column with the `:last_value` statistic.
      # NOTE(review): `statistics` and `method_applies` come from Base, which is
      # not visible here; presumably `statistics(:last_value)` is the value
      # learned from the training split. Confirm against Base.
      class Ffill < Base
        method_applies :ffill

        def self.description
          "Forward fill imputation"
        end

        # Fills nulls in the column with the last known value.
        #
        # @param df the data frame to transform
        # @return the data frame with nulls in this column replaced, or the
        #   input unchanged when no last value is available
        def transform(df)
          fill = last_value
          # Only nil means "nothing available". `present?` would also reject
          # `false` and "", which are legitimate values to fill with (e.g. for
          # boolean columns), so imputation would silently be skipped.
          return df if fill.nil?

          target_type = column.polars_datatype
          source = Polars.col(column.name)

          df.with_column(
            Polars.when(source.is_null)
              .then(Polars.lit(fill).cast(target_type))
              .otherwise(source.cast(target_type))
              .alias(column.name)
          )
        end

        # @return the :last_value statistic for this column (may be nil)
        def last_value
          statistics(:last_value)
        end
      end
    end
  end
end
@@ -0,0 +1,103 @@
module EasyML
  class Column
    class Imputers
      # Applies one preprocessing step to a column by running every adapter
      # (Clip, Mean, ...) whose `applies?` returns true, in a fixed order.
      class Imputer
        attr_accessor :dataset, :column, :preprocessing_step

        # @param column the column being imputed; must respond to `dataset`
        # @param preprocessing_step [Hash] step configuration; `:method` and
        #   `:params` are validated against Imputers.supported_methods/params
        # @raise [ArgumentError] when the step names an unsupported method or param
        def initialize(column, preprocessing_step)
          @column = column
          @dataset = column.dataset
          @preprocessing_step = preprocessing_step.with_indifferent_access
          validate_preprocessing_step!
        end

        def inspect
          "#<#{self.class.name} adapters=#{adapters.map(&:inspect).join(", ")}>"
        end

        # Adapter classes in the order their transforms are applied.
        def ordered_adapters
          [
            Clip,
            Mean,
            Median,
            Constant,
            Ffill,
            Categorical,
            MostFrequent,
            Today,
            OneHotEncoder,
            OrdinalEncoder,
          ]
        end

        # Adapter instances that apply to this step (memoized).
        def adapters
          @adapters ||= ordered_adapters
            .map { |klass| klass.new(column, preprocessing_step) }
            .select(&:applies?)
        end

        # Builds one Imputer per key of the column's preprocessing steps.
        #
        # The previous implementation returned the new Imputer from the reduce
        # block (so the accumulator stopped being a Hash after the first key)
        # and passed keyword arguments to a positional initializer.
        #
        # @return [Hash{Symbol => Imputer}, nil] nil when the column has no steps
        def imputers
          steps = column.preprocessing_steps
          return nil if steps.blank?

          @imputers ||= steps.keys.each_with_object({}) do |key, by_key|
            by_key[key.to_sym] = Imputer.new(column, steps[key])
          end
        end

        # Comma-separated descriptions of the applicable adapters.
        def description
          adapters.map(&:description).compact.join(", ")
        end

        # @return [Boolean] whether any adapter applies to this step
        def anything?
          adapters.any?
        end

        # Runs the data frame through every applicable adapter in order.
        def transform(df)
          return df unless anything?

          adapters.reduce(df) { |frame, adapter| adapter.transform(frame) }
        end

        # Applies only the Clip adapter, and only if it is among the adapters.
        def clip(df)
          return df unless adapters.map(&:class).include?(Clip)

          EasyML::Column::Imputers::Clip.new(column, preprocessing_step).transform(df)
        end

        # Reverses ordinal encoding, and only if OrdinalEncoder is among the adapters.
        def decode_labels(df)
          return df unless adapters.map(&:class).include?(OrdinalEncoder)

          EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
        end

        private

        def validate_preprocessing_step!
          validate_params!
          validate_method!
        end

        def validate_params!
          params = preprocessing_step[:params]
          return unless params

          params.keys.each do |param|
            next if Imputers.supported_params.include?(param.to_sym)

            raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
          end
        end

        def validate_method!
          return unless preprocessing_step[:method]
          return if Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)

          raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :mean preprocessing method; fills
      # null cells of the column with the `:mean` statistic.
      class Mean < Base
        method_applies :mean

        def self.description
          "Mean imputation"
        end

        # @param df the data frame to transform
        # @return the data frame with nulls filled, or the input unchanged when
        #   no mean is available
        def transform(df)
          fill_value = mean
          return df unless fill_value.present?

          imputed = Polars.col(column.name).fill_null(fill_value).alias(column.name)
          df.with_column(imputed)
        end

        # @return the :mean statistic for this column
        def mean
          statistics(:mean)
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :median preprocessing method; fills
      # null cells of the column with the `:median` statistic.
      class Median < Base
        method_applies :median

        def self.description
          "Median imputation"
        end

        # @param df the data frame to transform
        # @return the data frame with nulls filled, or the input unchanged when
        #   no median is available
        def transform(df)
          fill_value = median
          return df unless fill_value.present?

          imputed = Polars.col(column.name).fill_null(fill_value).alias(column.name)
          df.with_column(imputed)
        end

        # @return the :median statistic for this column
        def median
          statistics(:median)
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :most_frequent preprocessing method;
      # fills null cells of the column with the `:most_frequent_value` statistic.
      class MostFrequent < Base
        method_applies :most_frequent

        def self.description
          "Most frequent value imputation"
        end

        # @param df the data frame to transform
        # @return the data frame with nulls filled, or the input unchanged when
        #   no most-frequent value is available
        def transform(df)
          fill_value = most_frequent
          # Only nil means "nothing available". `present?` would also reject
          # `false` and "", so a boolean column whose most frequent value is
          # `false` would never be imputed.
          return df if fill_value.nil?

          df.with_column(
            Polars.col(column.name).fill_null(fill_value).alias(column.name)
          )
        end

        # @return the :most_frequent_value statistic for this column (may be nil)
        def most_frequent
          statistics(:most_frequent_value)
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
module EasyML
  class Column
    class Imputers
      # Null object used when no imputer is configured for a step.
      #
      # Any unknown message is answered by handing back its first argument
      # unchanged, so `transform(df)`, `clip(df)` and similar calls are no-ops.
      class NullImputer
        # @return [false] a null imputer never has anything to apply
        def anything?
          false
        end

        # A null imputer has nothing to describe. Defined explicitly so callers
        # that collect descriptions and `compact` them get an empty result.
        #
        # @return [nil]
        def description
          nil
        end

        # Returns the first argument (conventionally the data frame) untouched.
        # The argument is optional so that zero-argument messages do not raise
        # ArgumentError; extra arguments are ignored.
        def method_missing(_name, df = nil, *_rest)
          df
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :one_hot preprocessing param.
      #
      # Adds one boolean column per allowed category, named
      # "<column>_<category>" with dashes replaced by underscores, then drops
      # the source column.
      class OneHotEncoder < Base
        param_applies :one_hot

        def self.description
          "One-hot encoder"
        end

        # @param df the data frame to transform
        # @return the encoded data frame, or the input unchanged when the
        #   column has no allowed categories
        def transform(df)
          categories = allowed_categories
          return df unless categories.present?

          encoded = categories.reduce(df) do |frame, value|
            flag_name = "#{column.name}_#{value}".tr("-", "_")
            frame.with_column(
              frame[column.name].cast(Polars::String).eq(value.to_s).cast(Polars::Boolean).alias(flag_name)
            )
          end

          encoded.drop([column.name])
        end

        # Sorted allowed categories. Falls back to an empty list when the
        # column reports none, so the guard in `transform` is reached instead
        # of raising NoMethodError on nil.
        def allowed_categories
          (column.allowed_categories || []).sort
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :ordinal_encoding preprocessing param.
      #
      # Maps column values to integer codes using the `:label_encoder`
      # statistic and maps codes back using `:label_decoder`.
      class OrdinalEncoder < Base
        param_applies :ordinal_encoding

        def self.description
          "Ordinal encoder"
        end

        # Encodes the column. Values missing from the encoder map to
        # `other_value`.
        #
        # @param df the data frame to transform
        # @return the encoded data frame, or the input unchanged when no
        #   encoder is available
        def transform(df)
          return df unless label_encoder.present?

          case column.datatype
          when :categorical
            # Collapse categories outside the allowed list into "other".
            df = df.with_column(
              Polars.when(Polars.col(column.name).is_in(allowed_categories))
                .then(Polars.col(column.name))
                .otherwise(Polars.lit("other"))
                .alias(column.name)
            )
          when :boolean
            # no-op
          end

          # Loop-invariant: compute the fallback code once, not per row.
          fallback_code = other_value
          df.with_column(
            df[column.name].map { |v| label_encoder[column.cast(v)] || fallback_code }.alias(column.name)
          )
        end

        # Decodes integer codes back to labels.
        #
        # @param df an Array of codes, or a data frame containing this column
        # @return an Array of labels, or the data frame with the column decoded
        def decode_labels(df)
          if df.is_a?(Array)
            return df.map { |v| label_decoder[v.to_i] }
          end

          df.with_column(
            df[column.name].map { |v| label_decoder[v.to_i] }.alias(column.name)
          )
        end

        def categories
          label_encoder.keys
        end

        def values
          label_encoder.values
        end

        # Casts encoder keys with `column.cast`.
        def cast_encoder(encoder)
          encoder.transform_keys { |k| column.cast(k) }
        end

        # Casts decoder keys to Integer.
        def cast_decoder(decoder)
          decoder.transform_keys { |k| k.to_i }
        end

        # The encoder with cast keys. Returns an empty Hash (not memoized) when
        # the statistic is missing, so the `present?` guard in `transform`
        # works instead of raising NoMethodError on nil.
        def label_encoder
          return @label_encoder if @label_encoder

          raw_encoder = statistics(:label_encoder)
          return {} if raw_encoder.nil?

          @label_encoder = cast_encoder(raw_encoder)
        end

        def label_decoder
          @label_decoder ||= cast_decoder(statistics(:label_decoder))
        end

        # Code assigned to values absent from the encoder: one past the largest code.
        def other_value
          label_encoder.values.max + 1
        end

        def allowed_categories
          column.allowed_categories
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
module EasyML
  class Column
    class Imputers
      # Imputer adapter registered for the :today preprocessing method; fills
      # null cells of the column with the beginning of the current day.
      # NOTE(review): `UTC` is defined outside this file; presumably a UTC
      # time-zone helper. Confirm.
      class Today < Base
        method_applies :today

        def self.description
          "Current date imputation"
        end

        # @param df the data frame to transform
        # @return the data frame with nulls in this column replaced
        def transform(df)
          start_of_today = Polars.lit(UTC.today.beginning_of_day)
          imputed = Polars.col(column.name).fill_null(start_of_today).alias(column.name)
          df.with_column(imputed)
        end
      end
    end
  end
end
@@ -0,0 +1,126 @@
module EasyML
  class Column
    # Entry point for a column's preprocessing: builds one Imputer per
    # configured preprocessing step and exposes the :training and :inference
    # groups, falling back to a NullImputer when a group is not configured.
    class Imputers
      attr_accessor :dataset, :column

      # Params accepted alongside each preprocessing method.
      ALLOWED_PARAMS = {
        constant: [:constant],
        categorical: %i[categorical_min one_hot ordinal_encoding],
        most_frequent: %i[one_hot ordinal_encoding],
        mean: [:clip],
        median: [:clip],
      }.freeze

      # Strategies offered per column datatype (value + display label).
      PREPROCESSING_STRATEGIES = {
        float: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        integer: [
          { value: "ffill", label: "Forward Fill" },
          { value: "mean", label: "Mean" },
          { value: "median", label: "Median" },
          { value: "constant", label: "Constant Value" },
        ],
        boolean: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        datetime: [
          { value: "ffill", label: "Forward Fill" },
          { value: "constant", label: "Constant Value" },
          { value: "today", label: "Current Date" },
        ],
        string: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        text: [
          { value: "ffill", label: "Forward Fill" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
        categorical: [
          { value: "ffill", label: "Forward Fill" },
          { value: "categorical", label: "Categorical" },
          { value: "most_frequent", label: "Most Frequent" },
          { value: "constant", label: "Constant Value" },
        ],
      }.freeze

      class << self
        # NOTE: this shadows Module#constants for this class; it returns the
        # configuration hash below rather than a list of constant names.
        def constants
          {
            preprocessing_strategies: PREPROCESSING_STRATEGIES,
          }
        end

        # Mutable registry (memoized Hash).
        # NOTE(review): presumably populated by `param_applies` in Base — confirm.
        def params_by_class
          @params_by_class ||= {}
        end

        # Mutable registry (memoized Hash).
        # NOTE(review): presumably populated by `method_applies` in Base — confirm.
        def methods_by_class
          @methods_by_class ||= {}
        end

        # Mutable list of param names accepted by Imputer validation.
        def supported_params
          @supported_params ||= []
        end

        # Mutable list of method names accepted by Imputer validation.
        def supported_methods
          @supported_methods ||= []
        end
      end

      # @param column the column to preprocess; must respond to `dataset`
      #   and `preprocessing_steps`
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # One Imputer per key of the column's preprocessing steps.
      #
      # @return [Hash{Symbol => Imputer}] empty when the column has no steps
      def imputers
        steps = column.preprocessing_steps
        return {} if steps.blank?

        @imputers ||= steps.keys.each_with_object({}) do |key, by_key|
          by_key[key.to_sym] = Imputer.new(column, steps[key])
        end
      end

      # Imputer for the :training step, or a NullImputer.
      def training
        @training ||= imputer_group(:training)
      end

      # Imputer for the :inference step, or a NullImputer.
      def inference
        @inference ||= imputer_group(:inference)
      end

      # Descriptions of the training preprocessing.
      #
      # @return [Array<String>] empty when there are no steps or when the
      #   training group cannot describe itself (e.g. only an inference step
      #   is configured and the group is a null object)
      def preprocessing_descriptions
        return [] if column.preprocessing_steps.blank?

        group = training
        return [] unless group.respond_to?(:description)

        [group.description].compact
      end

      private

      def imputer_group(key)
        imputers.dig(key.to_sym) || NullImputer.new
      end
    end
  end
end
@@ -0,0 +1,18 @@
module EasyML
  class Column
    # Thin facade that picks the datatype-specific learner for a column and
    # forwards `learn` to it.
    class Learner
      attr_accessor :dataset, :column

      delegate :learn, to: :learner

      # @param column the column to learn statistics for; must respond to `dataset`
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # The datatype-specific learner instance (memoized).
      def learner
        @learner ||= begin
          learner_class = EasyML::Column::Learners::Base.adapter(column)
          learner_class.new(column)
        end
      end
    end
  end
end
@@ -0,0 +1,103 @@
module EasyML
  class Column
    module Learners
      # Base statistics learner. Chooses the learner class for a column's
      # datatype (`adapter`) and computes per-split statistics (`learn`).
      class Base
        attr_accessor :column, :dataset, :dtype, :select

        TYPES_ALL = %i[raw clipped processed].freeze
        TYPES_RAW = %i[raw clipped].freeze
        TYPES_PROCESSED = %i[processed].freeze

        # @param column the column to learn from; must respond to `dataset`
        def initialize(column)
          @column = column
          @dataset = column.dataset
          # When the dataset has a date column it is passed along as `select:`
          # when reading splits (see `learn_split`); `last_value` sorts by it.
          @select = dataset.date_column.present? ? [dataset.date_column.name] : []
        end

        # Resolves the learner class for a column from its datatype, inferring
        # the datatype from the raw data when the column has none.
        #
        # @return [Class] one of the Learners classes
        # @raise [RuntimeError] when the datatype cannot be determined or is unknown
        def self.adapter(column)
          dtype = begin
            column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          rescue StandardError
            # The underlying error remains reachable through Exception#cause.
            raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
          end

          case dtype.to_sym
          when :float, :integer
            EasyML::Column::Learners::Numeric
          when :string, :text
            EasyML::Column::Learners::String
          when :categorical
            EasyML::Column::Learners::Categorical
          when :datetime, :date
            EasyML::Column::Learners::Datetime
          when :boolean
            EasyML::Column::Learners::Boolean
          when :null
            EasyML::Column::Learners::Null
          else
            raise "Don't know how to learn from dtype: #{dtype}"
          end
        end

        # Split types to learn for the requested group. Columns that are not in
        # the raw dataset only ever learn from :processed.
        #
        # @param type [Symbol] :all, :raw or :processed (anything else means :all)
        # @return [Array<Symbol>]
        def types(type = :all)
          return TYPES_PROCESSED unless column.in_raw_dataset?

          case type
          when :raw then TYPES_RAW
          when :processed then TYPES_PROCESSED
          else TYPES_ALL
          end
        end

        # Learns statistics for each requested split type.
        #
        # @param type [Symbol] see `types`
        # @return [Hash{Symbol => Hash}] statistics keyed by split type
        def learn(type: :all)
          types(type).each_with_object({}) do |t, h|
            h[t] = learn_split(column.send(t))
          end
        end

        # Statistics computed over the full split.
        def full_dataset_statistics(df)
          return {} if df.nil?

          {
            num_rows: df.size,
            null_count: df[column.name].null_count || 0,
          }
        end

        # Statistics computed over the training portion only.
        def train_statistics(df)
          return {} if df.nil?

          {
            last_value: last_value(df),
            most_frequent_value: df[column.name].mode.sort.to_a&.first,
          }
        end

        # Full-split statistics merged with training statistics.
        def learn_split(split)
          df = split.data(select: select)
          train_df = split.train(select: select)
          full_dataset_stats = full_dataset_statistics(df)
          train_stats = train_statistics(train_df)
          full_dataset_stats.merge!(train_stats)
        end

        # The most recent non-null value of the column, ordered by the
        # dataset's date column.
        #
        # @return the value, or nil when there is no date column, the frame is
        #   empty, the date column is absent from the frame, or every value is null
        def last_value(df)
          date_column = dataset.date_column
          return unless date_column.present?
          return nil if df.empty? || !df.columns.include?(date_column.name)

          # Sort by date (newest first) and keep the first non-null value.
          latest = df.sort(date_column.name, reverse: true)
            .filter(Polars.col(column.name).is_not_null)
            .select(column.name)
            .head(1)
          # An all-null column leaves no rows; `item` would raise on that.
          return nil if latest.empty?

          latest.item
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
module EasyML
  class Column
    module Learners
      # Learner for boolean columns; inherits the categorical behaviour.
      class Boolean < Categorical
        # Sort key for boolean values: `true` maps to 1, anything else to 0.
        def sort_by(value)
          return 1 if value == true

          0
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
module EasyML
  class Column
    module Learners
      # Learner for categorical columns: adds allowed categories, value counts
      # and label encoder/decoder maps to the training statistics.
      class Categorical < String
        # Learns statistics for each requested split type. Both :raw and
        # :processed are learned from the raw split; :processed additionally
        # forces `null_count` to 0. Any other split type returned by `types`
        # (e.g. :clipped) is stored as nil.
        #
        # Accepts the type positionally (`learn(:raw)`), by keyword
        # (`learn(type: :raw)`, matching Base#learn) or not at all (`learn`).
        #
        # @param positional_type [Symbol, nil] requested group, positional form
        # @param type [Symbol] requested group, keyword form (default :all)
        # @return [Hash{Symbol => Hash, nil}] statistics keyed by split type
        def learn(positional_type = nil, type: :all)
          requested = positional_type || type

          types(requested).each_with_object({}) do |split_type, stats|
            stats[split_type] = case split_type
              when :raw then learn_split(column.raw)
              when :processed then learn_split(column.raw).merge!(null_count: 0)
              end
          end
        end

        # Training statistics extended with categorical information.
        def train_statistics(df)
          return {} if df.nil?

          base_stats = super(df)
          categorical_stats = {
            allowed_categories: allowed_categories(df),
            counts: df[column.name].value_counts.to_hash,
          }.merge!(learn_encoder_decoder(df))

          base_stats.merge!(categorical_stats)
        end

        # Builds the value→count map and the label encoder/decoder. Codes are
        # assigned by position after sorting the non-nil values with
        # `column.sort_by`.
        #
        # @return [Hash] with :value, :label_encoder and :label_decoder
        def learn_encoder_decoder(df)
          value_counts = df[column.name].value_counts
          value_column, count_column = value_counts.columns

          as_hash = value_counts
            .select([value_column, count_column])
            .rows.to_a.to_h
            .transform_keys(&column.method(:cast))
          label_encoder = as_hash.keys.compact
            .sort_by(&column.method(:sort_by))
            .each_with_index.to_h

          {
            value: as_hash,
            label_encoder: label_encoder,
            label_decoder: label_encoder.invert,
          }
        end

        # Non-nil values occurring at least `column.categorical_min` times,
        # sorted with `column.sort_by`.
        def allowed_categories(df)
          val_counts = df[column.name].value_counts
          frequent = val_counts[val_counts["count"] >= column.categorical_min]
          frequent[column.name].to_a.compact.sort_by(&column.method(:sort_by))
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
module EasyML
  class Column
    module Learners
      # Learner for datetime columns.
      class Datetime < Base
        # Full-split statistics plus the number of unique values.
        def full_dataset_statistics(df)
          return {} if df.nil?

          stats = super(df)
          stats.merge!({ unique_count: df[column.name].n_unique })
        end

        # Last element of the column after sorting it.
        def last_value(df)
          sorted_values = df[column.name].sort
          sorted_values[-1]
        end
      end
    end
  end
end